From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we used created to stop people accidentally using these vars directly. Now we have sparse, we can use that (next patch). tj: * Updated to convert stuff which were missed by or added after the original patch. * Kill per_cpu_var() macro. Signed-off-by: Rusty Russell Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- arch/x86/kernel/apic/nmi.c | 6 +++--- arch/x86/kernel/head_32.S | 6 +++--- arch/x86/kernel/vmlinux.lds.S | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index e631cc4416f..45404379d17 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -437,8 +437,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) * Ayiee, looks like this CPU is stuck ... * wait a few IRQs (5 seconds) before doing the oops ... */ - __this_cpu_inc(per_cpu_var(alert_counter)); - if (__this_cpu_read(per_cpu_var(alert_counter)) == 5 * nmi_hz) + __this_cpu_inc(alert_counter); + if (__this_cpu_read(alert_counter) == 5 * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -446,7 +446,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) regs, panic_on_timeout); } else { __get_cpu_var(last_irq_sum) = sum; - __this_cpu_write(per_cpu_var(alert_counter), 0); + __this_cpu_write(alert_counter, 0); } /* see if the nmi watchdog went off */ diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 050c278481b..fd39eaf83b8 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -438,8 +438,8 @@ is386: movl $2,%ecx # set MP */ cmpb $0,ready jne 1f - movl $per_cpu__gdt_page,%eax - movl $per_cpu__stack_canary,%ecx + movl $gdt_page,%eax + movl $stack_canary,%ecx movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) shrl $16, %ecx movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) @@ -702,7 +702,7 @@ idt_descr: .word 0 # 32 bit align gdt_desc.address ENTRY(early_gdt_descr) .word GDT_ENTRIES*8-1 - .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ + .long gdt_page /* Overwritten for secondary CPUs */ /* * The boot_gdt must mirror the equivalent in setup.S and is diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9f..ecb92717c41 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -312,7 +312,7 @@ SECTIONS * Per-cpu symbols which need to be offset from __per_cpu_load * for the boot processor. */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); @@ -323,7 +323,7 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), +. = ASSERT((irq_stack_union == 0), "irq_stack_union is not at start of per-cpu area"); #endif -- cgit v1.2.3-70-g09d2 From b76365a18f7593c9df32a74bf2b4a39741b67bc6 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Thu, 17 Dec 2009 10:53:25 -0600 Subject: x86, uv: Add serial number parameter to uv_bios_get_sn_info() Add system_serial_number to the information returned by uv_bios_get_sn_info() UV BIOS call. Signed-off-by: Russ Anderson LKML-Reference: <20091217165323.GA30774@sgi.com> Cc: Jack Steiner Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 7 ++++--- arch/x86/kernel/apic/x2apic_uv_x.c | 6 +++--- arch/x86/kernel/bios_uv.c | 20 ++++++++++++++------ 3 files changed, 21 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 2751f3075d8..3fbc1f348a7 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -18,8 +18,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ #include @@ -89,7 +89,7 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, unsigned long *); @@ -104,6 +104,7 @@ extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; +extern long system_serial_number; #define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index b684bb303cb..af5d103bb53 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. */ #include #include @@ -627,8 +627,8 @@ void __init uv_system_init(void) } uv_bios_init(); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &sn_coherency_id, &sn_region_size); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); uv_rtc_init(); for_each_present_cpu(cpu) { diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b0..c918ebab52a 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -15,8 +15,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ #include @@ -30,6 +30,7 @@ static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { struct uv_systab *tab = &uv_systab; + s64 ret; if (!tab->function) /* @@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) */ return BIOS_STATUS_UNIMPLEMENTED; - return efi_call6((void *)__va(tab->function), - (u64)which, a1, a2, a3, a4, a5); + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; } +EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) @@ -73,11 +76,14 @@ long sn_coherency_id; EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; +EXPORT_SYMBOL_GPL(uv_type); s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region) + long *region, long *ssn) { s64 ret; u64 v0, v1; @@ -97,8 +103,11 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, *coher = part.coherence_id; if (region) *region = part.region_size; + if (ssn) + *ssn = v1; return ret; } +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, @@ -185,4 +194,3 @@ void uv_bios_init(void) void uv_bios_init(void) { } #endif - -- cgit v1.2.3-70-g09d2 From 9959c888a38b0f25b0e81a480f537d6489348442 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 28 Dec 2009 21:08:29 -0800 Subject: x86: Increase NR_IRQS and nr_irqs I have a system with lots of igb and ixgbe, when iov/vf are enabled for them, we hit the limit of 3064. when system has 20 pcie installed, and one card has 2 functions, and one function needs 64 msi-x, may need 20 * 2 * 64 = 2560 for msi-x but if iov and vf are enabled may need 20 * 2 * 64 * 3 = 7680 for msi-x assume system with 5 ioapic, nr_irqs_gsi will be 120. NR_CPUS = 512, and nr_cpu_ids = 128 will have NR_IRQS = 256 + 512 * 64 = 33024 will have nr_irqs = 120 + 8 * 128 + 120 * 64 = 8824 When SPARSE_IRQ is not set, there is no increase with kernel data size. when NR_CPUS=128, and SPARSE_IRQ is set: text data bss dec hex filename 21837444 4216564 12480736 38534744 24bfe58 vmlinux.before 21837442 4216580 12480736 38534758 24bfe66 vmlinux.after when NR_CPUS=4096, and SPARSE_IRQ is set text data bss dec hex filename 21878619 5610244 13415392 40904255 270263f vmlinux.before 21878617 5610244 13415392 40904253 270263d vmlinux.after Signed-off-by: Yinghai Lu Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4B398ECD.1080506@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 12 ++++++------ arch/x86/kernel/apic/io_apic.c | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4611f085cd4..3ab43df089c 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -154,21 +154,21 @@ static inline int invalid_vm86_irq(int irq) #define NR_IRQS_LEGACY 16 -#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS ) #define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) #ifdef CONFIG_X86_IO_APIC # ifdef CONFIG_SPARSE_IRQ +# define CPU_VECTOR_LIMIT (64 * NR_CPUS) # define NR_IRQS \ (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ (NR_VECTORS + CPU_VECTOR_LIMIT) : \ (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) # else -# if NR_CPUS < MAX_IO_APICS -# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) -# else -# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) -# endif +# define CPU_VECTOR_LIMIT (32 * NR_CPUS) +# define NR_IRQS \ + (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \ + (NR_VECTORS + CPU_VECTOR_LIMIT) : \ + (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) # endif #else /* !CONFIG_X86_IO_APIC: */ # define NR_IRQS NR_IRQS_LEGACY diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index de00c4619a5..d9cd1f1b9c0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3840,7 +3840,7 @@ int __init arch_probe_nr_irqs(void) /* * for MSI and HT dyn irq */ - nr += nr_irqs_gsi * 16; + nr += nr_irqs_gsi * 64; #endif if (nr < nr_irqs) nr_irqs = nr; -- cgit v1.2.3-70-g09d2 From 1b1d9258181bae199dc940f4bd0298126b9a73d9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Dec 2009 16:12:56 +0000 Subject: x86-64: Modify copy_user_generic() alternatives mechanism In order to avoid unnecessary chains of branches, rather than implementing copy_user_generic() as a function consisting of just a single (possibly patched) branch, instead properly deal with patching call instructions in the alternative instructions framework, and move the patching into the callers. As a follow-on, one could also introduce something like __EXPORT_SYMBOL_ALT() to avoid patching call sites in modules. Signed-off-by: Jan Beulich Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4B2BB8180200007800026AE7@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 7 ++++++- arch/x86/include/asm/uaccess_64.h | 21 ++++++++++++++++++++- arch/x86/kernel/alternative.c | 4 +++- arch/x86/kernel/x8664_ksyms_64.c | 3 ++- arch/x86/lib/copy_user_64.S | 6 ------ 5 files changed, 31 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 69b74a7b877..3b5b828767b 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -125,11 +125,16 @@ static inline void alternatives_smp_switch(int smp) {} asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : output : "i" (0), ## input) +/* Like alternative_io, but for replacing a direct call with another one. */ +#define alternative_call(oldfunc, newfunc, feature, output, input...) \ + asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ + : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io */ -#define ASM_OUTPUT2(a, b) a, b +#define ASM_OUTPUT2(a...) a struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 46324c6a4f6..a78c4030544 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include /* @@ -16,7 +18,24 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long -copy_user_generic(void *to, const void *from, unsigned len); +copy_user_generic_string(void *to, const void *from, unsigned len); +__must_check unsigned long +copy_user_generic_unrolled(void *to, const void *from, unsigned len); + +static __always_inline __must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len) +{ + unsigned ret; + + alternative_call(copy_user_generic_unrolled, + copy_user_generic_string, + X86_FEATURE_REP_GOOD, + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), + "=d" (len)), + "1" (to), "2" (from), "3" (len) + : "memory", "rcx", "r8", "r9", "r10", "r11"); + return ret; +} __must_check unsigned long _copy_to_user(void __user *to, const void *from, unsigned len); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de7353c0ce9..2589ea4c60c 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -205,7 +205,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { struct alt_instr *a; - char insnbuf[MAX_PATCH_LEN]; + u8 insnbuf[MAX_PATCH_LEN]; DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); for (a = start; a < end; a++) { @@ -223,6 +223,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start, } #endif memcpy(insnbuf, a->replacement, a->replacementlen); + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += a->replacement - a->instr; add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 619f7f88b8c..693920b2249 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -26,7 +26,8 @@ EXPORT_SYMBOL(__put_user_2); EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); -EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(copy_user_generic_string); +EXPORT_SYMBOL(copy_user_generic_unrolled); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index cf889d4e076..71100c98e33 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -90,12 +90,6 @@ ENTRY(_copy_from_user) CFI_ENDPROC ENDPROC(_copy_from_user) -ENTRY(copy_user_generic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(copy_user_generic) - .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) -- cgit v1.2.3-70-g09d2 From ea94396629a3e0cb9a3a9c75335b1de255b30426 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Jan 2010 21:14:41 -0800 Subject: x86, apic: Don't waste a vector to improve vector spread We want to use a vector-assignment sequence that avoids stumbling onto 0x80 earlier in the sequence, in order to improve the spread of vectors across priority levels on machines with a small number of interrupt sources. Right now, this is done by simply making the first vector (0x31 or 0x41) completely unusable. This is unnecessary; all we need is to start assignment at a +1 offset, we don't actually need to prohibit the usage of this vector once we have wrapped around. Signed-off-by: H. Peter Anvin LKML-Reference: <4B426550.6000209@kernel.org> --- arch/x86/include/asm/irq_vectors.h | 9 +++++---- arch/x86/kernel/apic/io_apic.c | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index dbc81acb7e9..585a42810cf 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -133,11 +133,12 @@ #define MCE_SELF_VECTOR 0xeb /* - * First APIC vector available to drivers: (vectors 0x30-0xee) we - * start at 0x31 to spread out vectors evenly between priority - * levels. (0x80 is the syscall vector) + * First APIC vector available to drivers: (vectors 0x30-0xee). We + * start allocating at 0x31 to spread out vectors evenly between + * priority levels. (0x80 is the syscall vector) */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2) +#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 1) +#define VECTOR_OFFSET_START 1 #define NR_VECTORS 256 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index d9cd1f1b9c0..e9ba0903e9d 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1162,7 +1162,8 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + static int current_vector = FIRST_DEVICE_VECTOR + VECTOR_OFFSET_START; + static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; cpumask_var_t tmp_mask; -- cgit v1.2.3-70-g09d2 From 99659a929d653d0c9ce458091870544768add871 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 7 Jan 2010 15:35:42 +0100 Subject: x86, uv: Remove recursion in uv_heartbeat_enable() The recursion is not needed and does not improve readability. Signed-off-by: Roel Kluin LKML-Reference: <4B45F13E.3040202@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/x2apic_uv_x.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index af5d103bb53..d199dc34f54 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -475,7 +475,7 @@ static void uv_heartbeat(unsigned long ignored) static void __cpuinit uv_heartbeat_enable(int cpu) { - if (!uv_cpu_hub_info(cpu)->scir.enabled) { + while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); @@ -483,11 +483,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu) timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_hub_info(cpu)->scir.enabled = 1; - } - /* check boot cpu */ - if (!uv_cpu_hub_info(0)->scir.enabled) - uv_heartbeat_enable(0); + /* also ensure that boot cpu is enabled */ + cpu = 0; + } } #ifdef CONFIG_HOTPLUG_CPU -- cgit v1.2.3-70-g09d2 From 2ca49b2fcf5813571663c3c4c894b78148c43690 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 4 Jan 2010 09:47:35 -0500 Subject: x86: Macroise x86 cache descriptors Use a macro to define the cache sizes when cachesize > 1 MB. This is less typing, and less prone to introducing bugs like we saw in e02e0e1a130b9ca37c5186d38ad4b3aaf58bb149, and means we don't have to do maths when adding new non-power-of-2 updates like those seen recently. Signed-off-by: Dave Jones LKML-Reference: <20100104144735.GA18390@redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 84 ++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc6c8ef92dc..c2b722d5a72 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -31,6 +31,8 @@ struct _cache_table { short size; }; +#define MB(x) ((x) * 1024) + /* All the cache descriptor types we care about (no TLB or trace cache entries) */ @@ -44,9 +46,9 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ @@ -59,16 +61,16 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ - { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ + { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ + { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ + { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ + { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ + { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ + { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ + { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ @@ -77,34 +79,34 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ - { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ + { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ + { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ + { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ + { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ + { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ + { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ + { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ + { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ + { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ + { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ + { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ + { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ + { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ + { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ + { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; -- cgit v1.2.3-70-g09d2 From 0f1d683fb35d6c6f49ef696c95757f3970682a0e Mon Sep 17 00:00:00 2001 From: Naga Chumbalkar Date: Thu, 17 Dec 2009 20:18:27 +0000 Subject: [CPUFREQ] Processor Clocking Control interface driver Processor Clocking Control (PCC) is an interface between the BIOS and OSPM. Based on the server workload, OSPM can request what frequency it expects from a logical CPU, and the BIOS will achieve that frequency transparently. This patch introduces driver support for PCC. OSPM uses the PCC driver to communicate with the BIOS via the PCC interface. There is a Documentation file that provides a link to the PCC Specification, and also provides a summary of the PCC interface. Currently, certain HP ProLiant platforms implement the PCC interface. However, any platform whose BIOS implements the PCC Specification, can utilize this driver. V2 --> V1 changes (based on Dominik's suggestions): - Removed the dependency on CPU_FREQ_TABLE - "cpufreq_stats" will no longer PANIC. Actually, it will not load anymore because it is not applicable. - Removed the sanity check for target frequency in the ->target routine. NOTE: A patch to sanitize the target frequency requested by "ondemand" is needed to ensure that the target freq < policy->min. Can this driver be queued up for the 2.6.33 tree? Signed-off-by: Naga Chumbalkar Signed-off-by: Matthew Garrett Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- Documentation/cpu-freq/pcc-cpufreq.txt | 207 ++++++++++ arch/x86/kernel/cpu/cpufreq/Kconfig | 14 + arch/x86/kernel/cpu/cpufreq/Makefile | 1 + arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621 ++++++++++++++++++++++++++++++ drivers/acpi/processor_core.c | 2 + 5 files changed, 845 insertions(+) create mode 100644 Documentation/cpu-freq/pcc-cpufreq.txt create mode 100644 arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c (limited to 'arch/x86/kernel') diff --git a/Documentation/cpu-freq/pcc-cpufreq.txt b/Documentation/cpu-freq/pcc-cpufreq.txt new file mode 100644 index 00000000000..9e3c3b33514 --- /dev/null +++ b/Documentation/cpu-freq/pcc-cpufreq.txt @@ -0,0 +1,207 @@ +/* + * pcc-cpufreq.txt - PCC interface documentation + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + + Processor Clocking Control Driver + --------------------------------- + +Contents: +--------- +1. Introduction +1.1 PCC interface +1.1.1 Get Average Frequency +1.1.2 Set Desired Frequency +1.2 Platforms affected +2. Driver and /sys details +2.1 scaling_available_frequencies +2.2 cpuinfo_transition_latency +2.3 cpuinfo_cur_freq +2.4 related_cpus +3. Caveats + +1. Introduction: +---------------- +Processor Clocking Control (PCC) is an interface between the platform +firmware and OSPM. It is a mechanism for coordinating processor +performance (ie: frequency) between the platform firmware and the OS. + +The PCC driver (pcc-cpufreq) allows OSPM to take advantage of the PCC +interface. + +OS utilizes the PCC interface to inform platform firmware what frequency the +OS wants for a logical processor. The platform firmware attempts to achieve +the requested frequency. If the request for the target frequency could not be +satisfied by platform firmware, then it usually means that power budget +conditions are in place, and "power capping" is taking place. + +1.1 PCC interface: +------------------ +The complete PCC specification is available here: +http://www.acpica.org/download/Processor-Clocking-Control-v1p0.pdf + +PCC relies on a shared memory region that provides a channel for communication +between the OS and platform firmware. PCC also implements a "doorbell" that +is used by the OS to inform the platform firmware that a command has been +sent. + +The ACPI PCCH() method is used to discover the location of the PCC shared +memory region. The shared memory region header contains the "command" and +"status" interface. PCCH() also contains details on how to access the platform +doorbell. + +The following commands are supported by the PCC interface: +* Get Average Frequency +* Set Desired Frequency + +The ACPI PCCP() method is implemented for each logical processor and is +used to discover the offsets for the input and output buffers in the shared +memory region. + +When PCC mode is enabled, the platform will not expose processor performance +or throttle states (_PSS, _TSS and related ACPI objects) to OSPM. Therefore, +the native P-state driver (such as acpi-cpufreq for Intel, powernow-k8 for +AMD) will not load. + +However, OSPM remains in control of policy. The governor (eg: "ondemand") +computes the required performance for each processor based on server workload. +The PCC driver fills in the command interface, and the input buffer and +communicates the request to the platform firmware. The platform firmware is +responsible for delivering the requested performance. + +Each PCC command is "global" in scope and can affect all the logical CPUs in +the system. Therefore, PCC is capable of performing "group" updates. With PCC +the OS is capable of getting/setting the frequency of all the logical CPUs in +the system with a single call to the BIOS. + +1.1.1 Get Average Frequency: +---------------------------- +This command is used by the OSPM to query the running frequency of the +processor since the last time this command was completed. The output buffer +indicates the average unhalted frequency of the logical processor expressed as +a percentage of the nominal (ie: maximum) CPU frequency. The output buffer +also signifies if the CPU frequency is limited by a power budget condition. + +1.1.2 Set Desired Frequency: +---------------------------- +This command is used by the OSPM to communicate to the platform firmware the +desired frequency for a logical processor. The output buffer is currently +ignored by OSPM. The next invocation of "Get Average Frequency" will inform +OSPM if the desired frequency was achieved or not. + +1.2 Platforms affected: +----------------------- +The PCC driver will load on any system where the platform firmware: +* supports the PCC interface, and the associated PCCH() and PCCP() methods +* assumes responsibility for managing the hardware clocking controls in order +to deliver the requested processor performance + +Currently, certain HP ProLiant platforms implement the PCC interface. On those +platforms PCC is the "default" choice. + +However, it is possible to disable this interface via a BIOS setting. In +such an instance, as is also the case on platforms where the PCC interface +is not implemented, the PCC driver will fail to load silently. + +2. Driver and /sys details: +--------------------------- +When the driver loads, it merely prints the lowest and the highest CPU +frequencies supported by the platform firmware. + +The PCC driver loads with a message such as: +pcc-cpufreq: (v1.00.00) driver loaded with frequency limits: 1600 MHz, 2933 +MHz + +This means that the OPSM can request the CPU to run at any frequency in +between the limits (1600 MHz, and 2933 MHz) specified in the message. + +Internally, there is no need for the driver to convert the "target" frequency +to a corresponding P-state. + +The VERSION number for the driver will be of the format v.xy.ab. +eg: 1.00.02 + ----- -- + | | + | -- this will increase with bug fixes/enhancements to the driver + |-- this is the version of the PCC specification the driver adheres to + + +The following is a brief discussion on some of the fields exported via the +/sys filesystem and how their values are affected by the PCC driver: + +2.1 scaling_available_frequencies: +---------------------------------- +scaling_available_frequencies is not created in /sys. No intermediate +frequencies need to be listed because the BIOS will try to achieve any +frequency, within limits, requested by the governor. A frequency does not have +to be strictly associated with a P-state. + +2.2 cpuinfo_transition_latency: +------------------------------- +The cpuinfo_transition_latency field is 0. The PCC specification does +not include a field to expose this value currently. + +2.3 cpuinfo_cur_freq: +--------------------- +A) Often cpuinfo_cur_freq will show a value different than what is declared +in the scaling_available_frequencies or scaling_cur_freq, or scaling_max_freq. +This is due to "turbo boost" available on recent Intel processors. If certain +conditions are met the BIOS can achieve a slightly higher speed than requested +by OSPM. An example: + +scaling_cur_freq : 2933000 +cpuinfo_cur_freq : 3196000 + +B) There is a round-off error associated with the cpuinfo_cur_freq value. +Since the driver obtains the current frequency as a "percentage" (%) of the +nominal frequency from the BIOS, sometimes, the values displayed by +scaling_cur_freq and cpuinfo_cur_freq may not match. An example: + +scaling_cur_freq : 1600000 +cpuinfo_cur_freq : 1583000 + +In this example, the nominal frequency is 2933 MHz. The driver obtains the +current frequency, cpuinfo_cur_freq, as 54% of the nominal frequency: + + 54% of 2933 MHz = 1583 MHz + +Nominal frequency is the maximum frequency of the processor, and it usually +corresponds to the frequency of the P0 P-state. + +2.4 related_cpus: +----------------- +The related_cpus field is identical to affected_cpus. + +affected_cpus : 4 +related_cpus : 4 + +Currently, the PCC driver does not evaluate _PSD. The platforms that support +PCC do not implement SW_ALL. So OSPM doesn't need to perform any coordination +to ensure that the same frequency is requested of all dependent CPUs. + +3. Caveats: +----------- +The "cpufreq_stats" module in its present form cannot be loaded and +expected to work with the PCC driver. Since the "cpufreq_stats" module +provides information wrt each P-state, it is not applicable to the PCC driver. diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index f138c6c389b..870e6cc6ad2 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -10,6 +10,20 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + . + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 509296df294..1840c0a5170 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c new file mode 100644 index 00000000000..29368854533 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -0,0 +1,621 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq, + policy->cpuinfo.max_freq); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%llx\n", + (u64)pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto pcch_null; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto free; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); + + return 0; +free: + pcc_clear_mapping(); + free_percpu(pcc_cpu_info); +pcch_null: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 41731236f9a..7fe413cc7d9 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -123,6 +123,8 @@ static const struct file_operations acpi_processor_info_fops = { #endif DEFINE_PER_CPU(struct acpi_processor *, processors); +EXPORT_PER_CPU_SYMBOL(processors); + struct acpi_processor_errata errata __read_mostly; static int set_no_mwait(const struct dmi_system_id *id) { -- cgit v1.2.3-70-g09d2 From fb4635932a4e19c2f55383f968a0e9b64da37354 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 7 Jan 2010 15:26:22 -0500 Subject: [CPUFREQ] Fix cast warning in pcc driver. arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c:458: warning: cast from pointer to integer of different size Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index 29368854533..ff36d2979a9 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -455,8 +455,7 @@ static int __init pcc_cpufreq_probe(void) } pcch_hdr = pcch_virt_addr; - dprintk("probe: PCCH header (virtual) addr: 0x%llx\n", - (u64)pcch_hdr); + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); dprintk("probe: PCCH header is at physical address: 0x%llx," " signature: 0x%x, length: %d bytes, major: %d, minor: %d," " supported features: 0x%x, command field: 0x%x," -- cgit v1.2.3-70-g09d2 From 3bef444797f7624f8fbd27f4e0334ce96a108725 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 13 Jan 2010 10:45:55 -0500 Subject: x86: Merge show_regs() Using kernel_stack_pointer() allows 32-bit and 64-bit versions to be merged. This is more correct for 64-bit, since the old %rsp is always saved on the stack. Signed-off-by: Brian Gerst LKML-Reference: <1263397555-27695-1-git-send-email-brgerst@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/process.c | 7 +++++++ arch/x86/kernel/process_32.c | 6 ------ arch/x86/kernel/process_64.c | 6 ------ 3 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 98c2cdeb599..cf1e04b2ad6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -92,6 +92,13 @@ void exit_thread(void) } } +void show_regs(struct pt_regs *regs) +{ + show_registers(regs); + show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), + regs->bp); +} + void show_regs_common(void) { const char *board, *product; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9c517b5858f..fe6a34e42bd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -174,12 +174,6 @@ void __show_regs(struct pt_regs *regs, int all) d6, d7); } -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, ®s->sp, regs->bp); -} - void release_thread(struct task_struct *dead_task) { BUG_ON(dead_task->mm); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 52fbd0c6019..418f860880a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -211,12 +211,6 @@ void __show_regs(struct pt_regs *regs, int all) printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } -void show_regs(struct pt_regs *regs) -{ - show_registers(regs); - show_trace(NULL, regs, (void *)(regs + 1), regs->bp); -} - void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { -- cgit v1.2.3-70-g09d2 From 00097c4fdf117d9845d772f571a987ae95523f8c Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Sun, 17 Jan 2010 19:44:44 -0200 Subject: x86, trivial: Fix grammo in tsc comment about Geode TSC reliability Signed-off-by: Thadeu Lima de Souza Cascardo Cc: marcelo@kvack.org Cc: dilinger@collabora.co.uk Cc: trivial@kernel.org LKML-Reference: <1263764685-9871-1-git-send-email-cascardo@holoscopio.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba..23066ecf12f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -806,7 +806,7 @@ static void __init check_system_tsc_reliable(void) unsigned long res_low, res_high; rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ + /* Geode_LX - the OLPC CPU has a very reliable TSC */ if (res_low & RTSC_SUSP) tsc_clocksource_reliable = 1; #endif -- cgit v1.2.3-70-g09d2 From 722b3654852e48b93367a63f8ada9ee1cd43f2d3 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Jan 2010 16:19:10 -0800 Subject: x86, vmi: Fix vmi_get_timer_vector() to use IRQ0_VECTOR FIRST_DEVICE_VECTOR is going away and it looks like a bad hack to steal FIRST_DEVICE_VECTOR / FIRST_EXTERNAL_VECTOR, when it looks like it needs IRQ0_VECTOR. Fix vmi_get_timer_vector() to use IRQ0_VECTOR. Signed-off-by: Suresh Siddha LKML-Reference: <20100114002118.436172066@sbs-t61.sc.intel.com> Cc: Alok N Kataria Cc: Zach Amsden Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmiclock_32.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 74c92bb194d..1268d993e9c 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void) static inline unsigned int vmi_get_timer_vector(void) { -#ifdef CONFIG_X86_IO_APIC - return FIRST_DEVICE_VECTOR; -#else - return FIRST_EXTERNAL_VECTOR; -#endif + return IRQ0_VECTOR; } /** vmi clockchip */ @@ -239,8 +235,6 @@ void __init vmi_time_init(void) vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3-70-g09d2 From 6579b474572fd54c583ac074e8e7aaae926c62ef Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Jan 2010 16:19:11 -0800 Subject: x86, irq: Use 0x20 for the IRQ_MOVE_CLEANUP_VECTOR instead of 0x1f After talking to some more folks inside intel (Peter Anvin, Asit Mallick), the safest option (for future compatibility etc) seen was to use vector 0x20 for IRQ_MOVE_CLEANUP_VECTOR instead of using vector 0x1f (which is documented as reserved vector in the Intel IA32 manuals). Also we don't need to reserve the entire privilege level (all 16 vectors in the priority bucket that IRQ_MOVE_CLEANUP_VECTOR falls into), as the x86 architecture (section 10.9.3 in SDM Vol3a) specifies that with in the priority level, the higher the vector number the higher the priority. And hence we don't need to reserve the complete priority level 0x20-0x2f for the IRQ migration cleanup logic. So change the IRQ_MOVE_CLEANUP_VECTOR to 0x20 and allow 0x21-0x2f to be used for device interrupts. 0x30-0x3f will be used for ISA interrupts (these also can be migrated in the context of IOAPIC and hence need to be at a higher priority level than IRQ_MOVE_CLEANUP_VECTOR). Signed-off-by: Suresh Siddha LKML-Reference: <20100114002118.521826763@sbs-t61.sc.intel.com> Cc: Yinghai Lu Cc: Eric W. Biederman Cc: Maciej W. Rozycki Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq_vectors.h | 47 +++++++++++++------------------------- arch/x86/kernel/apic/io_apic.c | 4 ++-- 2 files changed, 18 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 585a42810cf..8767d99c4f6 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -28,19 +28,22 @@ #define MCE_VECTOR 0x12 /* - * IDT vectors usable for external interrupt sources start - * at 0x20: - * hpa said we can start from 0x1f. - * 0x1f is documented as reserved. However, the ability for the APIC - * to generate vectors starting at 0x10 is documented, as is the - * ability for the CPU to receive any vector number as an interrupt. - * 0x1f is used for IRQ_MOVE_CLEANUP_VECTOR since that vector needs - * an entire privilege level (16 vectors) all by itself at a higher - * priority than any actual device vector. Thus, by placing it in the - * otherwise-unusable 0x10 privilege level, we avoid wasting a full - * 16-vector block. + * IDT vectors usable for external interrupt sources start at 0x20. + * (0x80 is the syscall vector, 0x30-0x3f are for ISA) */ -#define FIRST_EXTERNAL_VECTOR 0x1f +#define FIRST_EXTERNAL_VECTOR 0x20 +/* + * We start allocating at 0x21 to spread out vectors evenly between + * priority levels. (0x80 is the syscall vector) + */ +#define VECTOR_OFFSET_START 1 + +/* + * Reserve the lowest usable vector (and hence lowest priority) 0x20 for + * triggering cleanup after irq migration. 0x21-0x2f will still be used + * for device interrupts. + */ +#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR #define IA32_SYSCALL_VECTOR 0x80 #ifdef CONFIG_X86_32 @@ -48,17 +51,7 @@ #endif /* - * Reserve the lowest usable priority level 0x10 - 0x1f for triggering - * cleanup after irq migration. - * this overlaps with the reserved range for cpu exceptions so this - * will need to be changed to 0x20 - 0x2f if the last cpu exception is - * ever allocated. - */ - -#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR - -/* - * Vectors 0x20-0x2f are used for ISA interrupts. + * Vectors 0x30-0x3f are used for ISA interrupts. * round up to the next 16-vector boundary */ #define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15) @@ -132,14 +125,6 @@ */ #define MCE_SELF_VECTOR 0xeb -/* - * First APIC vector available to drivers: (vectors 0x30-0xee). We - * start allocating at 0x31 to spread out vectors evenly between - * priority levels. (0x80 is the syscall vector) - */ -#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 1) -#define VECTOR_OFFSET_START 1 - #define NR_VECTORS 256 #define FPU_IRQ 13 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index e9ba0903e9d..409f4943dc1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1162,7 +1162,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR + VECTOR_OFFSET_START; + static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START; static int current_offset = VECTOR_OFFSET_START % 8; unsigned int old_vector; int cpu, err; @@ -1199,7 +1199,7 @@ next: if (vector >= first_system_vector) { /* If out of vectors on large boxen, must share them. */ offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; + vector = FIRST_EXTERNAL_VECTOR + offset; } if (unlikely(current_vector == vector)) continue; -- cgit v1.2.3-70-g09d2 From 97943390b043bcafca69f9163b86bbf627b75589 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 19 Jan 2010 12:20:54 -0800 Subject: x86, irq: Don't block IRQ0_VECTOR..IRQ15_VECTOR's on all cpu's Currently IRQ0..IRQ15 are assigned to IRQ0_VECTOR..IRQ15_VECTOR's on all the cpu's. If these IRQ's are handled by legacy pic controller, then the kernel handles them only on cpu 0. So there is no need to block this vector space on all cpu's. Similarly if these IRQ's are handled by IO-APIC, then the IRQ affinity will determine on which cpu's we need allocate the vector resource for that particular IRQ. This can be done dynamically and here also there is no need to block 16 vectors for IRQ0..IRQ15 on all cpu's. Fix this by initially assigning IRQ0..IRQ15 to IRQ0_VECTOR..IRQ15_VECTOR's only on cpu 0. If the legacy controllers like pic handles these irq's, then this configuration will be fixed. If more modern controllers like IO-APIC handle these IRQ's, then we start with this configuration and as IRQ's migrate, vectors (/and cpu's) associated with these IRQ's change dynamically. This will freeup the block of 16 vectors on other cpu's which don't handle IRQ0..IRQ15, which can now be used for other IRQ's that the particular cpu handle. [ hpa: this also an architectural cleanup for future legacy-PIC-free configurations. ] [ hpa: fixed typo NR_LEGACY_IRQS -> NR_IRQS_LEGACY ] Signed-off-by: Suresh Siddha LKML-Reference: <1263932453.2814.52.camel@sbs-t61.sc.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq.h | 1 + arch/x86/kernel/apic/io_apic.c | 33 ++++++++++----------------------- arch/x86/kernel/irqinit.c | 35 +++++++++++++++++------------------ arch/x86/kernel/vmiclock_32.c | 2 ++ 4 files changed, 30 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 5458380b6ef..262292729fc 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -48,5 +48,6 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); +extern int nr_legacy_irqs; #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 409f4943dc1..1a30587a6bc 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -140,27 +138,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; void __init io_apic_disable_legacy(void) { @@ -185,8 +166,14 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) - cpumask_setall(cfg[i].domain); + /* + * For legacy IRQ's, start with assigning irq0 to irq15 to + * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. + */ + if (i < nr_legacy_irqs) { + cfg[i].vector = IRQ0_VECTOR + i; + cpumask_set_cpu(0, cfg[i].domain); + } } return 0; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d5932226614..fce55d53263 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -84,24 +84,7 @@ static struct irqaction irq2 = { }; DEFINE_PER_CPU(vector_irq_t, vector_irq) = { - [0 ... IRQ0_VECTOR - 1] = -1, - [IRQ0_VECTOR] = 0, - [IRQ1_VECTOR] = 1, - [IRQ2_VECTOR] = 2, - [IRQ3_VECTOR] = 3, - [IRQ4_VECTOR] = 4, - [IRQ5_VECTOR] = 5, - [IRQ6_VECTOR] = 6, - [IRQ7_VECTOR] = 7, - [IRQ8_VECTOR] = 8, - [IRQ9_VECTOR] = 9, - [IRQ10_VECTOR] = 10, - [IRQ11_VECTOR] = 11, - [IRQ12_VECTOR] = 12, - [IRQ13_VECTOR] = 13, - [IRQ14_VECTOR] = 14, - [IRQ15_VECTOR] = 15, - [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 + [0 ... NR_VECTORS - 1] = -1, }; int vector_used_by_percpu_irq(unsigned int vector) @@ -116,6 +99,9 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } +/* Number of legacy interrupts */ +int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; + void __init init_ISA_irqs(void) { int i; @@ -142,6 +128,19 @@ void __init init_ISA_irqs(void) void __init init_IRQ(void) { + int i; + + /* + * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. + * If these IRQ's are handled by legacy interrupt-controllers like PIC, + * then this configuration will likely be static after the boot. If + * these IRQ's are handled by more mordern controllers like IO-APIC, + * then this vector space can be freed and re-used dynamically as the + * irq's migrate etc. + */ + for (i = 0; i < nr_legacy_irqs; i++) + per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; + x86_init.irqs.intr_init(); } diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 1268d993e9c..2f1ca561429 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,6 +235,8 @@ void __init vmi_time_init(void) vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.3-70-g09d2 From dcf39daf3d6d97f8741e82f0b9fb7554704ed2d1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:05 +0100 Subject: x86, cacheinfo: Fix disabling of L3 cache indices * Correct the masks used for writing the cache index disable indices. * Do not turn off L3 scrubber - it is not necessary. * Make sure wbinvd is executed on the same node where the L3 is. * Check for out-of-bounds values written to the registers. * Make show_cache_disable hex values unambiguous * Check for Erratum #388 Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-4-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc6c8ef92dc..08c91abc4d3 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -18,6 +18,7 @@ #include #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -299,8 +300,10 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) if (boot_cpu_data.x86 == 0x11) return; - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + /* see errata #382 and #388 */ + if ((boot_cpu_data.x86 == 0x10) && + ((boot_cpu_data.x86_model < 0x9) || + (boot_cpu_data.x86_mask < 0x1))) return; this_leaf->can_disable = 1; @@ -726,12 +729,12 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, return -EINVAL; pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "%x\n", reg); + return sprintf(buf, "0x%08x\n", reg); } #define SHOW_CACHE_DISABLE(index) \ static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ { \ return show_cache_disable(this_leaf, buf, index); \ } @@ -745,7 +748,9 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; - unsigned int scrubber = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff if (!this_leaf->can_disable) return -EINVAL; @@ -759,21 +764,24 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); + /* do not allow writes outside of allowed bits */ + if (val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) + return -EINVAL; - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); + val |= BIT(30); pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); return count; } #define STORE_CACHE_DISABLE(index) \ static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ const char *buf, size_t count) \ { \ return store_cache_disable(this_leaf, buf, count, index); \ -- cgit v1.2.3-70-g09d2 From 897de50e08937663912c86fb12ad7f708af2386c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:06 +0100 Subject: x86, cacheinfo: Add cache index disable sysfs attrs only to L3 caches The cache_disable_[01] attribute in /sys/devices/system/cpu/cpu?/cache/index[0-3]/ is enabled on all cache levels although only L3 supports it. Add it only to the cache level that actually supports it. Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-5-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 08c91abc4d3..3976ce95095 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -814,16 +814,24 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, show_cache_disable_1, store_cache_disable_1); +#define DEFAULT_SYSFS_CACHE_ATTRS \ + &type.attr, \ + &level.attr, \ + &coherency_line_size.attr, \ + &physical_line_partition.attr, \ + &ways_of_associativity.attr, \ + &number_of_sets.attr, \ + &size.attr, \ + &shared_cpu_map.attr, \ + &shared_cpu_list.attr + static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, + DEFAULT_SYSFS_CACHE_ATTRS, + NULL +}; + +static struct attribute *default_l3_attrs[] = { + DEFAULT_SYSFS_CACHE_ATTRS, &cache_disable_0.attr, &cache_disable_1.attr, NULL @@ -916,6 +924,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); @@ -934,6 +943,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + if (this_leaf->can_disable) + ktype_cache.default_attrs = default_l3_attrs; + else + ktype_cache.default_attrs = default_attrs; + retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(ici_cache_kobject, cpu), -- cgit v1.2.3-70-g09d2 From 048a8774ca43488d78605031f11cc206d7a2682a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 22 Jan 2010 16:01:07 +0100 Subject: x86, cacheinfo: Calculate L3 indices We need to know the valid L3 indices interval when disabling them over /sysfs. Do that when the core is brought online and add boundary checks to the sysfs .store attribute. Signed-off-by: Borislav Petkov LKML-Reference: <1264172467-25155-6-git-send-email-bp@amd64.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 3976ce95095..589b705e80e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,7 +151,8 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -161,7 +162,8 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + bool can_disable; + unsigned int l3_indices; }; unsigned short num_cache_leaves; @@ -291,6 +293,29 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +static unsigned int __cpuinit amd_calc_l3_indices(void) +{ + /* + * We're called over smp_call_function_single() and therefore + * are on the correct cpu. + */ + int cpu = smp_processor_id(); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int sc0, sc1, sc2, sc3; + u32 val; + + pci_read_config_dword(dev, 0x1C4, &val); + + /* calculate subcache sizes */ + sc0 = !(val & BIT(0)); + sc1 = !(val & BIT(4)); + sc2 = !(val & BIT(8)) + !(val & BIT(9)); + sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { @@ -306,7 +331,8 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) (boot_cpu_data.x86_mask < 0x1))) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = true; + this_leaf->l3_indices = amd_calc_l3_indices(); } static int @@ -765,7 +791,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; /* do not allow writes outside of allowed bits */ - if (val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) return -EINVAL; val |= BIT(30); -- cgit v1.2.3-70-g09d2 From 439913fffd39374c3737186b22d2d56c3a0ae526 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Thu, 28 Jan 2010 10:53:19 +0800 Subject: ACPI: replace acpi_integer by u64 acpi_integer is now obsolete and removed from the ACPICA code base, replaced by u64. Signed-off-by: Lin Ming Signed-off-by: Len Brown --- arch/ia64/hp/common/aml_nfw.c | 6 ++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 6 ++--- drivers/acpi/battery.c | 4 +-- drivers/acpi/ec.c | 4 +-- drivers/acpi/glue.c | 4 +-- drivers/acpi/osl.c | 4 +-- drivers/acpi/power_meter.c | 30 +++++++++++----------- drivers/acpi/processor_idle.c | 2 +- drivers/acpi/processor_throttling.c | 24 +++++++++--------- drivers/acpi/utils.c | 16 ++++++------ drivers/acpi/video.c | 2 +- drivers/ata/libata-acpi.c | 4 +-- drivers/ide/ide-acpi.c | 8 +++--- drivers/input/misc/atlas_btns.c | 2 +- drivers/pci/pci-acpi.c | 2 +- drivers/platform/x86/toshiba_bluetooth.c | 4 +-- drivers/platform/x86/wmi.c | 4 +-- include/acpi/acpi_bus.h | 6 ++--- include/acpi/processor.h | 42 +++++++++++++++---------------- 19 files changed, 87 insertions(+), 87 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/ia64/hp/common/aml_nfw.c b/arch/ia64/hp/common/aml_nfw.c index 4abd2c79bb1..22078486d35 100644 --- a/arch/ia64/hp/common/aml_nfw.c +++ b/arch/ia64/hp/common/aml_nfw.c @@ -77,7 +77,7 @@ static void aml_nfw_execute(struct ia64_nfw_context *c) c->arg[4], c->arg[5], c->arg[6], c->arg[7]); } -static void aml_nfw_read_arg(u8 *offset, u32 bit_width, acpi_integer *value) +static void aml_nfw_read_arg(u8 *offset, u32 bit_width, u64 *value) { switch (bit_width) { case 8: @@ -95,7 +95,7 @@ static void aml_nfw_read_arg(u8 *offset, u32 bit_width, acpi_integer *value) } } -static void aml_nfw_write_arg(u8 *offset, u32 bit_width, acpi_integer *value) +static void aml_nfw_write_arg(u8 *offset, u32 bit_width, u64 *value) { switch (bit_width) { case 8: @@ -114,7 +114,7 @@ static void aml_nfw_write_arg(u8 *offset, u32 bit_width, acpi_integer *value) } static acpi_status aml_nfw_handler(u32 function, acpi_physical_address address, - u32 bit_width, acpi_integer *value, void *handler_context, + u32 bit_width, u64 *value, void *handler_context, void *region_context) { struct ia64_nfw_context *context = handler_context; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index f125e5c551c..55d42bc443e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data) static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - acpi_integer control; + u64 control; if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; @@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val = -ENODEV; - acpi_integer control, status; + u64 control, status; if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); @@ -948,7 +948,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, u32 fid; u32 vid; u32 freq, index; - acpi_integer status, control; + u64 status, control; if (data->exttype) { status = data->acpi_data.states[i].status; diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index cada73ffdfa..58d2c91ba62 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -324,8 +324,8 @@ static int extract_package(struct acpi_battery *battery, strncpy(ptr, element->string.pointer, 32); else if (element->type == ACPI_TYPE_INTEGER) { strncpy(ptr, (u8 *)&element->integer.value, - sizeof(acpi_integer)); - ptr[sizeof(acpi_integer)] = 0; + sizeof(u64)); + ptr[sizeof(u64)] = 0; } else *ptr = 0; /* don't have value */ } else { diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d6471bb6852..748ced85dec 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -589,7 +589,7 @@ static u32 acpi_ec_gpe_handler(void *data) static acpi_status acpi_ec_space_handler(u32 function, acpi_physical_address address, - u32 bits, acpi_integer *value, + u32 bits, u64 *value, void *handler_context, void *region_context) { struct acpi_ec *ec = handler_context; @@ -620,7 +620,7 @@ acpi_ec_space_handler(u32 function, acpi_physical_address address, ++address; if (function == ACPI_READ) { result = acpi_ec_read(ec, address, &temp); - (*value) |= ((acpi_integer)temp) << i; + (*value) |= ((u64)temp) << i; } else { temp = 0xff & ((*value) >> i); result = acpi_ec_write(ec, address, temp); diff --git a/drivers/acpi/glue.c b/drivers/acpi/glue.c index 4c8fcff662c..6d5b64b7d52 100644 --- a/drivers/acpi/glue.c +++ b/drivers/acpi/glue.c @@ -87,7 +87,7 @@ static int acpi_find_bridge_device(struct device *dev, acpi_handle * handle) /* Get device's handler per its address under its parent */ struct acpi_find_child { acpi_handle handle; - acpi_integer address; + u64 address; }; static acpi_status @@ -106,7 +106,7 @@ do_acpi_find_child(acpi_handle handle, u32 lvl, void *context, void **rv) return AE_OK; } -acpi_handle acpi_get_child(acpi_handle parent, acpi_integer address) +acpi_handle acpi_get_child(acpi_handle parent, u64 address) { struct acpi_find_child find = { NULL, address }; diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 02e8464e480..8e6d8665f0a 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -436,7 +436,7 @@ acpi_status acpi_os_remove_interrupt_handler(u32 irq, acpi_osd_handler handler) * Running in interpreter thread context, safe to sleep */ -void acpi_os_sleep(acpi_integer ms) +void acpi_os_sleep(u64 ms) { schedule_timeout_interruptible(msecs_to_jiffies(ms)); } @@ -603,7 +603,7 @@ acpi_os_read_pci_configuration(struct acpi_pci_id * pci_id, u32 reg, acpi_status acpi_os_write_pci_configuration(struct acpi_pci_id * pci_id, u32 reg, - acpi_integer value, u32 width) + u64 value, u32 width) { int result, size; diff --git a/drivers/acpi/power_meter.c b/drivers/acpi/power_meter.c index dc4ffadf812..834c5af0de4 100644 --- a/drivers/acpi/power_meter.c +++ b/drivers/acpi/power_meter.c @@ -71,17 +71,17 @@ static const struct acpi_device_id power_meter_ids[] = { MODULE_DEVICE_TABLE(acpi, power_meter_ids); struct acpi_power_meter_capabilities { - acpi_integer flags; - acpi_integer units; - acpi_integer type; - acpi_integer accuracy; - acpi_integer sampling_time; - acpi_integer min_avg_interval; - acpi_integer max_avg_interval; - acpi_integer hysteresis; - acpi_integer configurable_cap; - acpi_integer min_cap; - acpi_integer max_cap; + u64 flags; + u64 units; + u64 type; + u64 accuracy; + u64 sampling_time; + u64 min_avg_interval; + u64 max_avg_interval; + u64 hysteresis; + u64 configurable_cap; + u64 min_cap; + u64 max_cap; }; struct acpi_power_meter_resource { @@ -93,9 +93,9 @@ struct acpi_power_meter_resource { acpi_string model_number; acpi_string serial_number; acpi_string oem_info; - acpi_integer power; - acpi_integer cap; - acpi_integer avg_interval; + u64 power; + u64 cap; + u64 avg_interval; int sensors_valid; unsigned long sensors_last_updated; struct sensor_device_attribute sensors[NUM_SENSORS]; @@ -402,7 +402,7 @@ static ssize_t show_val(struct device *dev, struct sensor_device_attribute *attr = to_sensor_dev_attr(devattr); struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_power_meter_resource *resource = acpi_dev->driver_data; - acpi_integer val = 0; + u64 val = 0; switch (attr->index) { case 0: diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 7c0441f63b3..3455d7abf5f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -352,7 +352,7 @@ static int acpi_processor_get_power_info_default(struct acpi_processor *pr) static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) { acpi_status status = 0; - acpi_integer count; + u64 count; int current_count; int i; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index 1c5d7a8b2fd..7ded7542fc9 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -660,7 +660,7 @@ static int acpi_processor_get_throttling_fadt(struct acpi_processor *pr) #ifdef CONFIG_X86 static int acpi_throttling_rdmsr(struct acpi_processor *pr, - acpi_integer * value) + u64 *value) { struct cpuinfo_x86 *c; u64 msr_high, msr_low; @@ -681,13 +681,13 @@ static int acpi_throttling_rdmsr(struct acpi_processor *pr, rdmsr_safe(MSR_IA32_THERM_CONTROL, (u32 *)&msr_low , (u32 *) &msr_high); msr = (msr_high << 32) | msr_low; - *value = (acpi_integer) msr; + *value = (u64) msr; ret = 0; } return ret; } -static int acpi_throttling_wrmsr(struct acpi_processor *pr, acpi_integer value) +static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) { struct cpuinfo_x86 *c; unsigned int cpu; @@ -711,14 +711,14 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, acpi_integer value) } #else static int acpi_throttling_rdmsr(struct acpi_processor *pr, - acpi_integer * value) + u64 *value) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); return -1; } -static int acpi_throttling_wrmsr(struct acpi_processor *pr, acpi_integer value) +static int acpi_throttling_wrmsr(struct acpi_processor *pr, u64 value) { printk(KERN_ERR PREFIX "HARDWARE addr space,NOT supported yet\n"); @@ -727,7 +727,7 @@ static int acpi_throttling_wrmsr(struct acpi_processor *pr, acpi_integer value) #endif static int acpi_read_throttling_status(struct acpi_processor *pr, - acpi_integer *value) + u64 *value) { u32 bit_width, bit_offset; u64 ptc_value; @@ -746,7 +746,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr, address, (u32 *) &ptc_value, (u32) (bit_width + bit_offset)); ptc_mask = (1 << bit_width) - 1; - *value = (acpi_integer) ((ptc_value >> bit_offset) & ptc_mask); + *value = (u64) ((ptc_value >> bit_offset) & ptc_mask); ret = 0; break; case ACPI_ADR_SPACE_FIXED_HARDWARE: @@ -760,7 +760,7 @@ static int acpi_read_throttling_status(struct acpi_processor *pr, } static int acpi_write_throttling_state(struct acpi_processor *pr, - acpi_integer value) + u64 value) { u32 bit_width, bit_offset; u64 ptc_value; @@ -793,7 +793,7 @@ static int acpi_write_throttling_state(struct acpi_processor *pr, } static int acpi_get_throttling_state(struct acpi_processor *pr, - acpi_integer value) + u64 value) { int i; @@ -808,7 +808,7 @@ static int acpi_get_throttling_state(struct acpi_processor *pr, } static int acpi_get_throttling_value(struct acpi_processor *pr, - int state, acpi_integer *value) + int state, u64 *value) { int ret = -1; @@ -826,7 +826,7 @@ static int acpi_processor_get_throttling_ptc(struct acpi_processor *pr) { int state = 0; int ret; - acpi_integer value; + u64 value; if (!pr) return -EINVAL; @@ -993,7 +993,7 @@ static int acpi_processor_set_throttling_ptc(struct acpi_processor *pr, int state, bool force) { int ret; - acpi_integer value; + u64 value; if (!pr) return -EINVAL; diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 811fec10462..11882dbe209 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -107,12 +107,12 @@ acpi_extract_package(union acpi_object *package, case ACPI_TYPE_INTEGER: switch (format_string[i]) { case 'N': - size_required += sizeof(acpi_integer); - tail_offset += sizeof(acpi_integer); + size_required += sizeof(u64); + tail_offset += sizeof(u64); break; case 'S': size_required += - sizeof(char *) + sizeof(acpi_integer) + + sizeof(char *) + sizeof(u64) + sizeof(char); tail_offset += sizeof(char *); break; @@ -193,17 +193,17 @@ acpi_extract_package(union acpi_object *package, case ACPI_TYPE_INTEGER: switch (format_string[i]) { case 'N': - *((acpi_integer *) head) = + *((u64 *) head) = element->integer.value; - head += sizeof(acpi_integer); + head += sizeof(u64); break; case 'S': pointer = (u8 **) head; *pointer = tail; - *((acpi_integer *) tail) = + *((u64 *) tail) = element->integer.value; - head += sizeof(acpi_integer *); - tail += sizeof(acpi_integer); + head += sizeof(u64 *); + tail += sizeof(u64); /* NULL terminate string */ *tail = (char)0; tail += sizeof(char); diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index b765790b32b..6e9b49149fc 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -759,7 +759,7 @@ acpi_video_bus_POST_options(struct acpi_video_bus *video, static int acpi_video_bus_DOS(struct acpi_video_bus *video, int bios_flag, int lcd_flag) { - acpi_integer status = 0; + u64 status = 0; union acpi_object arg0 = { ACPI_TYPE_INTEGER }; struct acpi_object_list args = { 1, &arg0 }; diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index 1245838ac13..292fdbc0431 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -64,7 +64,7 @@ void ata_acpi_associate_sata_port(struct ata_port *ap) WARN_ON(!(ap->flags & ATA_FLAG_ACPI_SATA)); if (!sata_pmp_attached(ap)) { - acpi_integer adr = SATA_ADR(ap->port_no, NO_PORT_MULT); + u64 adr = SATA_ADR(ap->port_no, NO_PORT_MULT); ap->link.device->acpi_handle = acpi_get_child(ap->host->acpi_handle, adr); @@ -74,7 +74,7 @@ void ata_acpi_associate_sata_port(struct ata_port *ap) ap->link.device->acpi_handle = NULL; ata_for_each_link(link, ap, EDGE) { - acpi_integer adr = SATA_ADR(ap->port_no, link->pmp); + u64 adr = SATA_ADR(ap->port_no, link->pmp); link->device->acpi_handle = acpi_get_child(ap->host->acpi_handle, adr); diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index c0cf45a11b9..5cb01e5c323 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -108,11 +108,11 @@ bool ide_port_acpi(ide_hwif_t *hwif) * Returns 0 on success, <0 on error. */ static int ide_get_dev_handle(struct device *dev, acpi_handle *handle, - acpi_integer *pcidevfn) + u64 *pcidevfn) { struct pci_dev *pdev = to_pci_dev(dev); unsigned int bus, devnum, func; - acpi_integer addr; + u64 addr; acpi_handle dev_handle; acpi_status status; struct acpi_device_info *dinfo = NULL; @@ -122,7 +122,7 @@ static int ide_get_dev_handle(struct device *dev, acpi_handle *handle, devnum = PCI_SLOT(pdev->devfn); func = PCI_FUNC(pdev->devfn); /* ACPI _ADR encoding for PCI bus: */ - addr = (acpi_integer)(devnum << 16 | func); + addr = (u64)(devnum << 16 | func); DEBPRINT("ENTER: pci %02x:%02x.%01x\n", bus, devnum, func); @@ -169,7 +169,7 @@ static acpi_handle ide_acpi_hwif_get_handle(ide_hwif_t *hwif) { struct device *dev = hwif->gendev.parent; acpi_handle uninitialized_var(dev_handle); - acpi_integer pcidevfn; + u64 pcidevfn; acpi_handle chan_handle; int err; diff --git a/drivers/input/misc/atlas_btns.c b/drivers/input/misc/atlas_btns.c index 1b871917340..dfaa9a045ed 100644 --- a/drivers/input/misc/atlas_btns.c +++ b/drivers/input/misc/atlas_btns.c @@ -47,7 +47,7 @@ static acpi_status acpi_atlas_button_setup(acpi_handle region_handle, static acpi_status acpi_atlas_button_handler(u32 function, acpi_physical_address address, - u32 bit_width, acpi_integer *value, + u32 bit_width, u64 *value, void *handler_context, void *region_context) { acpi_status status; diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 7e2829538a4..441326c4413 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -143,7 +143,7 @@ static struct pci_platform_pm_ops acpi_pci_platform_pm = { static int acpi_pci_find_device(struct device *dev, acpi_handle *handle) { struct pci_dev * pci_dev; - acpi_integer addr; + u64 addr; pci_dev = to_pci_dev(dev); /* Please ref to ACPI spec for the syntax of _ADR */ diff --git a/drivers/platform/x86/toshiba_bluetooth.c b/drivers/platform/x86/toshiba_bluetooth.c index a350418e87e..94406861191 100644 --- a/drivers/platform/x86/toshiba_bluetooth.c +++ b/drivers/platform/x86/toshiba_bluetooth.c @@ -57,7 +57,7 @@ static struct acpi_driver toshiba_bt_rfkill_driver = { static int toshiba_bluetooth_enable(acpi_handle handle) { acpi_status res1, res2; - acpi_integer result; + u64 result; /* * Query ACPI to verify RFKill switch is set to 'on'. @@ -95,7 +95,7 @@ static int toshiba_bt_resume(struct acpi_device *device) static int toshiba_bt_rfkill_add(struct acpi_device *device) { acpi_status status; - acpi_integer bt_present; + u64 bt_present; int result = -ENODEV; /* diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index b104302fea0..09e9918c69c 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -796,7 +796,7 @@ static __init acpi_status parse_wdg(acpi_handle handle) */ static acpi_status acpi_wmi_ec_space_handler(u32 function, acpi_physical_address address, - u32 bits, acpi_integer * value, + u32 bits, u64 *value, void *handler_context, void *region_context) { int result = 0, i = 0; @@ -813,7 +813,7 @@ acpi_wmi_ec_space_handler(u32 function, acpi_physical_address address, if (function == ACPI_READ) { result = ec_read(address, &temp); - (*value) |= ((acpi_integer)temp) << i; + (*value) |= ((u64)temp) << i; } else { temp = 0xff & ((*value) >> i); result = ec_write(address, temp); diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 3cd9ccdcbd8..71c105c9b0e 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -250,8 +250,8 @@ struct acpi_device_wakeup_state { struct acpi_device_wakeup { acpi_handle gpe_device; - acpi_integer gpe_number; - acpi_integer sleep_state; + u64 gpe_number; + u64 sleep_state; struct acpi_handle_list resources; struct acpi_device_wakeup_state state; struct acpi_device_wakeup_flags flags; @@ -380,7 +380,7 @@ struct acpi_pci_root { }; /* helper */ -acpi_handle acpi_get_child(acpi_handle, acpi_integer); +acpi_handle acpi_get_child(acpi_handle, u64); int acpi_is_root_bridge(acpi_handle); acpi_handle acpi_get_pci_rootbridge_handle(unsigned int, unsigned int); struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); diff --git a/include/acpi/processor.h b/include/acpi/processor.h index 0ea5ef4eb6a..29831768c0e 100644 --- a/include/acpi/processor.h +++ b/include/acpi/processor.h @@ -92,11 +92,11 @@ struct acpi_processor_power { /* Performance Management */ struct acpi_psd_package { - acpi_integer num_entries; - acpi_integer revision; - acpi_integer domain; - acpi_integer coord_type; - acpi_integer num_processors; + u64 num_entries; + u64 revision; + u64 domain; + u64 coord_type; + u64 num_processors; } __attribute__ ((packed)); struct acpi_pct_register { @@ -110,12 +110,12 @@ struct acpi_pct_register { } __attribute__ ((packed)); struct acpi_processor_px { - acpi_integer core_frequency; /* megahertz */ - acpi_integer power; /* milliWatts */ - acpi_integer transition_latency; /* microseconds */ - acpi_integer bus_master_latency; /* microseconds */ - acpi_integer control; /* control value */ - acpi_integer status; /* success indicator */ + u64 core_frequency; /* megahertz */ + u64 power; /* milliWatts */ + u64 transition_latency; /* microseconds */ + u64 bus_master_latency; /* microseconds */ + u64 control; /* control value */ + u64 status; /* success indicator */ }; struct acpi_processor_performance { @@ -133,11 +133,11 @@ struct acpi_processor_performance { /* Throttling Control */ struct acpi_tsd_package { - acpi_integer num_entries; - acpi_integer revision; - acpi_integer domain; - acpi_integer coord_type; - acpi_integer num_processors; + u64 num_entries; + u64 revision; + u64 domain; + u64 coord_type; + u64 num_processors; } __attribute__ ((packed)); struct acpi_ptc_register { @@ -151,11 +151,11 @@ struct acpi_ptc_register { } __attribute__ ((packed)); struct acpi_processor_tx_tss { - acpi_integer freqpercentage; /* */ - acpi_integer power; /* milliWatts */ - acpi_integer transition_latency; /* microseconds */ - acpi_integer control; /* control value */ - acpi_integer status; /* success indicator */ + u64 freqpercentage; /* */ + u64 power; /* milliWatts */ + u64 transition_latency; /* microseconds */ + u64 control; /* control value */ + u64 status; /* success indicator */ }; struct acpi_processor_tx { u16 power; -- cgit v1.2.3-70-g09d2 From 69c89efb51510b3dc0fa336f7fa257c6e1799ee4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 29 Jan 2010 11:42:20 -0800 Subject: x86, irq: Update the vector domain for legacy irqs handled by io-apic In the recent change of not reserving IRQ0_VECTOR..IRQ15_VECTOR's on all cpu's, we start with irq 0..15 getting directed to (and handled on) cpu-0. In the logical flat mode, once the AP's are online (and before irqbalance comes into picture), kernel intends to handle these IRQ's on any cpu (as the logical flat mode allows to specify multiple cpu's for the irq destination and the chipset based routing can deliver to the interrupt to any one of the specified cpu's). This was broken with our recent change, which was ending up using only cpu 0 as the destination, even when the kernel was specifying to use all online cpu's for the logical flat mode case. Fix this by updating vector allocation domain (cfg->domain) for legacy irqs, when the IO-APIC handles them. Signed-off-by: Suresh Siddha LKML-Reference: <20100129194330.207790269@sbs-t61.sc.intel.com> Tested-by: Li Zefan Cc: Yinghai Lu Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1a30587a6bc..2430b31c985 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1428,6 +1428,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq cfg = desc->chip_data; + /* + * For legacy irqs, cfg->domain starts with cpu 0 for legacy + * controllers like 8259. Now that IO-APIC can handle this irq, update + * the cfg->domain. + */ + if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + apic->vector_allocation_domain(0, cfg->domain); + if (assign_irq_vector(irq, cfg, apic->target_cpus())) return; -- cgit v1.2.3-70-g09d2 From 9d133e5db993d577bd868b54083869fe5479fcff Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 29 Jan 2010 11:42:21 -0800 Subject: x86, irq: Move __setup_vector_irq() before the first irq enable in cpu online path Lowest priority delivery of logical flat mode is broken on some systems, such that even when IO-APIC RTE says deliver the interrupt to a particular CPU, interrupt subsystem delivers the interrupt to totally different CPU. For example, this behavior was observed on a P4 based system with SiS chipset which was reported by Li Zefan. We have been handling this kind of behavior by making sure that in logical flat mode, we assign the same vector to irq mappings on all the 8 possible logical cpu's. But we have been doing this initial assignment (__setup_vector_irq()) a little late (before which interrupts were already enabled for a short duration). Move the __setup_vector_irq() before the first irq enable point in the cpu online path to avoid the issue of not handling some interrupts that wrongly hit the cpu which is still coming online. Signed-off-by: Suresh Siddha LKML-Reference: <20100129194330.283696385@sbs-t61.sc.intel.com> Tested-by: Li Zefan Cc: Yinghai Lu Cc: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 8 +++++++- arch/x86/kernel/smpboot.c | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 2430b31c985..937150e4c06 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1256,11 +1256,16 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg) void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; struct irq_desc *desc; + /* + * vector_lock will make sure that we don't run into irq vector + * assignments that might be happening on another cpu in parallel, + * while we setup our initial vector to irq mappings. + */ + spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; @@ -1279,6 +1284,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } + spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f..b2ebcba729d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -241,6 +241,11 @@ static void __cpuinit smp_callin(void) map_cpu_to_logical_apicid(); notify_cpu_starting(cpuid); + + /* + * Need to setup vector mappings before we enable interrupts. + */ + __setup_vector_irq(smp_processor_id()); /* * Get our bogomips. * @@ -315,7 +320,6 @@ notrace static void __cpuinit start_secondary(void *unused) */ ipi_call_lock(); lock_vector_lock(); - __setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); ipi_call_unlock(); -- cgit v1.2.3-70-g09d2 From 3b9cfc0a99f88c0db7c72363620584a9b40b4543 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Sun, 31 Jan 2010 20:16:34 +0100 Subject: x86, mtrr: Constify struct mtrr_ops This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy LKML-Reference: <4B65D712.3080804@gmail.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/amd.c | 2 +- arch/x86/kernel/cpu/mtrr/centaur.c | 2 +- arch/x86/kernel/cpu/mtrr/cyrix.c | 2 +- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 6 +++--- arch/x86/kernel/cpu/mtrr/mtrr.h | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c index 33af14110df..92ba9cd31c9 100644 --- a/arch/x86/kernel/cpu/mtrr/amd.c +++ b/arch/x86/kernel/cpu/mtrr/amd.c @@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) return 0; } -static struct mtrr_ops amd_mtrr_ops = { +static const struct mtrr_ops amd_mtrr_ops = { .vendor = X86_VENDOR_AMD, .set = amd_set_mtrr, .get = amd_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c index de89f14eff3..316fe3e60a9 100644 --- a/arch/x86/kernel/cpu/mtrr/centaur.c +++ b/arch/x86/kernel/cpu/mtrr/centaur.c @@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t return 0; } -static struct mtrr_ops centaur_mtrr_ops = { +static const struct mtrr_ops centaur_mtrr_ops = { .vendor = X86_VENDOR_CENTAUR, .set = centaur_set_mcr, .get = centaur_get_mcr, diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 228d982ce09..68a3343e579 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -265,7 +265,7 @@ static void cyrix_set_all(void) post_set(); } -static struct mtrr_ops cyrix_mtrr_ops = { +static const struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, .set_all = cyrix_set_all, .set = cyrix_set_arr, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 55da0c5f68d..4d755846fee 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -752,7 +752,7 @@ int positive_have_wrcomb(void) /* * Generic structure... */ -struct mtrr_ops generic_mtrr_ops = { +const struct mtrr_ops generic_mtrr_ops = { .use_intel_if = 1, .set_all = generic_set_all, .get = generic_get_mtrr, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 84e83de5457..fe4622e8c83 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; -struct mtrr_ops *mtrr_if; +const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(struct mtrr_ops *ops) +void set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index a501dee9a87..df5e41f31a2 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size, extern int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type); -extern struct mtrr_ops generic_mtrr_ops; +extern const struct mtrr_ops generic_mtrr_ops; extern int positive_have_wrcomb(void); @@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index, u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); void get_mtrr_state(void); -extern void set_mtrr_ops(struct mtrr_ops *ops); +extern void set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; -extern struct mtrr_ops *mtrr_if; +extern const struct mtrr_ops *mtrr_if; #define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) #define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) -- cgit v1.2.3-70-g09d2 From 1b5576e69a5fe168c08a159685ac366316ac9bbc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 22 Jan 2010 11:21:04 +0800 Subject: x86: Remove BIOS data range from e820 In preparation for moving to the generic page_is_ram(), make explicit what we expect to be reserved and not reserved. Tested-by: Wu Fengguang Signed-off-by: Yinghai Lu LKML-Reference: <20100122033004.335813103@intel.com> Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 8 ++++++++ arch/x86/kernel/setup.c | 19 ++++++++++++++++++- arch/x86/mm/ioremap.c | 16 ---------------- 3 files changed, 26 insertions(+), 17 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index d17d482a04f..230687ba5ba 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -517,11 +517,19 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type, int checktype) { int i; + u64 end; u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cdb6a8a506d..f9b1f4e5ab7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -650,6 +650,23 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { {} }; +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -813,7 +830,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 334e63ca7b2..30e068d6462 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -29,22 +29,6 @@ int page_is_ram(unsigned long pagenr) resource_size_t addr, end; int i; - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - */ - if (pagenr == 0) - return 0; - - /* - * Second special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. - */ - if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && - pagenr < (BIOS_END >> PAGE_SHIFT)) - return 0; - for (i = 0; i < e820.nr_map; i++) { /* * Not usable memory: -- cgit v1.2.3-70-g09d2 From f266d7f5f89652a68e21e9882c44ee9104ad8d61 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 3 Feb 2010 21:21:32 +0200 Subject: x86_64: Print modules like i386 does Print modules list during kernel BUG. Signed-off-by: Alexey Dobriyan Cc: Arjan van de Ven Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 0ad9597073f..907a90e2901 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -291,6 +291,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); + print_modules(); __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); -- cgit v1.2.3-70-g09d2 From 34d2819f20782feb60f9434470ecfb200875fd41 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Feb 2010 09:51:28 +0100 Subject: x86, mtrr: Remove unused mtrr/state.c The last reference to the helpers in went away with 9a6b344ea967efa0bb5ca4cb5405f840652b66c4 leaving unused code. Remove it. Signed-off-by: Borislav Petkov LKML-Reference: <20100204085128.GA513@liondog.tnic> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/Makefile | 2 +- arch/x86/kernel/cpu/mtrr/state.c | 94 --------------------------------------- 2 files changed, 1 insertion(+), 95 deletions(-) delete mode 100644 arch/x86/kernel/cpu/mtrr/state.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile index f4361b56f8e..ad9e5ed8118 100644 --- a/arch/x86/kernel/cpu/mtrr/Makefile +++ b/arch/x86/kernel/cpu/mtrr/Makefile @@ -1,3 +1,3 @@ -obj-y := main.o if.o generic.o state.o cleanup.o +obj-y := main.o if.o generic.o cleanup.o obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c deleted file mode 100644 index dfc80b4e6b0..00000000000 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ /dev/null @@ -1,94 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include "mtrr.h" - -/* Put the processor into a state where MTRRs can be safely set */ -void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) -{ - unsigned int cr0; - - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if (use_intel() || is_cpu(CYRIX)) { - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_pge) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & ~X86_CR4_PGE); - } - - /* - * Disable and flush caches. Note that wbinvd flushes the TLBs - * as a side-effect - */ - cr0 = read_cr0() | X86_CR0_CD; - wbinvd(); - write_cr0(cr0); - wbinvd(); - - if (use_intel()) { - /* Save MTRR state */ - rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else were excluded at the top - */ - ctxt->ccr3 = getCx86(CX86_CCR3); - } - } -} - -void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) -{ - if (use_intel()) { - /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, - ctxt->deftype_hi); - } else { - if (is_cpu(CYRIX)) { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } - } -} - -/* Restore the processor after a set_mtrr_prepare */ -void set_mtrr_done(struct set_mtrr_context *ctxt) -{ - if (use_intel() || is_cpu(CYRIX)) { - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if (use_intel()) { - /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, - ctxt->deftype_hi); - } else { - /* - * Cyrix ARRs - - * everything else was excluded at the top - */ - setCx86(CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0(read_cr0() & 0xbfffffff); - - /* Restore value of CR4 */ - if (cpu_has_pge) - write_cr4(ctxt->cr4val); - } - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore(ctxt->flags); -} -- cgit v1.2.3-70-g09d2 From 17622339af2536b32cf29699ddd4ba0fe79a61d5 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 2 Feb 2010 14:41:39 -0800 Subject: clocksource: add argument to resume callback Pass the clocksource as an argument to the clocksource resume callback. Needed so we can point out which CMT channel the sh_cmt.c driver shall resume. Signed-off-by: Magnus Damm Cc: john stultz Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- arch/ia64/kernel/time.c | 2 +- arch/x86/kernel/hpet.c | 2 +- arch/x86/kernel/tsc.c | 2 +- include/linux/clocksource.h | 2 +- kernel/time/clocksource.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index a35c661e5e8..47a192781b0 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -61,7 +61,7 @@ unsigned long long sched_clock(void) #ifdef CONFIG_PARAVIRT static void -paravirt_clocksource_resume(void) +paravirt_clocksource_resume(struct clocksource *cs) { if (pv_time_ops.clocksource_resume) pv_time_ops.clocksource_resume(); diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad80a1c718c..ee4fa1bfcb3 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -266,7 +266,7 @@ static void hpet_resume_device(void) force_hpet_resume(); } -static void hpet_resume_counter(void) +static void hpet_resume_counter(struct clocksource *cs) { hpet_resume_device(); hpet_restart_counter(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 597683aa5ba..9eeb9be26aa 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void) } #endif -static void resume_tsc(void) +static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 8a4a130cc19..0de7e72c399 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -172,7 +172,7 @@ struct clocksource { u64 max_idle_ns; unsigned long flags; cycle_t (*vread)(void); - void (*resume)(void); + void (*resume)(struct clocksource *cs); #ifdef CONFIG_IA64 void *fsys_mmio; /* used by fsyscall asm code */ #define CLKSRC_FSYS_MMIO_SET(mmio, addr) ((mmio) = (addr)) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d3..08adacb2a1e 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -449,7 +449,7 @@ void clocksource_resume(void) list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) - cs->resume(); + cs->resume(cs); clocksource_resume_watchdog(); } -- cgit v1.2.3-70-g09d2 From 841582ea9e29a8f757c30c5377ce649586ba793a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 2 Feb 2010 14:38:14 -0800 Subject: x86, uv: Update UV arch to target Legacy VGA I/O correctly. Add function to direct Legacy VGA I/O traffic to correct I/O Hub. Signed-off-by: Mike Travis LKML-Reference: <201002022238.o12McEbi018727@imap1.linux-foundation.org> Cc: Thomas Gleixner Cc: Robin Holt Cc: Jack Steiner Cc: Ingo Molnar Cc: Jesse Barnes Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/uv/bios.h | 4 +++- arch/x86/kernel/apic/x2apic_uv_x.c | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/bios_uv.c | 19 +++++++++++++++++++ 3 files changed, 52 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 2751f3075d8..163427597d0 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -36,7 +36,8 @@ enum uv_bios_cmd { UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, UV_BIOS_MEMPROTECT, - UV_BIOS_GET_PARTITION_ADDR + UV_BIOS_GET_PARTITION_ADDR, + UV_BIOS_SET_LEGACY_VGA_TARGET }; /* @@ -96,6 +97,7 @@ extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); +extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); extern void uv_bios_init(void); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 21db3cbea7d..6ef2899eb86 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,8 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; int uv_min_hub_revision_id; @@ -553,6 +556,30 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, bool change_bridge) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", + pdev->devfn, decode, command_bits, change_bridge); + + if (!change_bridge) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + /* * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet @@ -691,4 +718,7 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); } diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index b0206a211b0..575127a6e35 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -154,6 +154,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) } EXPORT_SYMBOL_GPL(uv_bios_freq_base); +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + #ifdef CONFIG_EFI void uv_bios_init(void) -- cgit v1.2.3-70-g09d2 From 3235dc3f22378f35ce77eba0d0f62db2d9c4844e Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Sat, 6 Feb 2010 18:47:17 +0100 Subject: x86: Remove trailing spaces in messages Signed-off-by: Frans Pop Cc: Avi Kivity Cc: x86@kernel.org LKML-Reference: <1265478443-31072-10-git-send-email-elendil@planet.nl> [ Left out the KVM bits. ] Signed-off-by: Ingo Molnar --- arch/x86/boot/mkcpustr.c | 2 +- arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/apic/numaq_32.c | 2 +- arch/x86/kernel/apm_32.c | 4 ++-- arch/x86/kernel/efi.c | 2 +- arch/x86/kernel/microcode_intel.c | 2 +- arch/x86/kernel/uv_sysfs.c | 6 +++--- arch/x86/tools/test_get_len.c | 4 ++-- 9 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 8ef60f20b37..919257f526f 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -22,7 +22,7 @@ int main(void) int i, j; const char *str; - printf("static const char x86_cap_strs[] = \n"); + printf("static const char x86_cap_strs[] =\n"); for (i = 0; i < NCAPINTS; i++) { for (j = 0; j < 32; j++) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e80f291472a..71c4443bb91 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -587,7 +587,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld) \n", + "PM-Timer: %lu (%ld)\n", (unsigned long)res, *deltatsc); *deltatsc = (long)res; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816..6bdd2c7ead7 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1647,7 +1647,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); + " Stat Dmod Deli Vect:\n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251..47dd856708e 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc) mpc_record = 0; printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + "Found an OEM MPC table at %8p - parsing it...\n", oemtable); if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { printk(KERN_WARNING diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index b5b6b23bce5..031aa887b0e 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1992,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d) apm_info.disabled = 1; printk(KERN_INFO "%s machine detected. " "Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); } return 0; } diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index cdcfb122f25..c2fa9b8b497 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -362,7 +362,7 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s \n", + printk(KERN_INFO "EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index ebd193e476c..85a343e2893 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -328,7 +328,7 @@ static int apply_microcode(int cpu) cpu_num, mc_intel->hdr.rev); return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x \n", + pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c index 36afb98675a..309c70fb775 100644 --- a/arch/x86/kernel/uv_sysfs.c +++ b/arch/x86/kernel/uv_sysfs.c @@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void) if (!sgi_uv_kobj) sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); return -EINVAL; } ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); return ret; } ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); return ret; } diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index bee8d6ac269..13403fc95a9 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -43,7 +43,7 @@ static int x86_64; static void usage(void) { fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" - " %s [-y|-n] [-v] \n", prog); + " %s [-y|-n] [-v]\n", prog); fprintf(stderr, "\t-y 64bit mode\n"); fprintf(stderr, "\t-n 32bit mode\n"); fprintf(stderr, "\t-v verbose mode\n"); @@ -69,7 +69,7 @@ static void dump_field(FILE *fp, const char *name, const char *indent, static void dump_insn(FILE *fp, struct insn *insn) { - fprintf(fp, "Instruction = { \n"); + fprintf(fp, "Instruction = {\n"); dump_field(fp, "prefixes", "\t", &insn->prefixes); dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); -- cgit v1.2.3-70-g09d2 From 0271f91003d3703675be13b8865618359a6caa1f Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Thu, 4 Feb 2010 19:06:33 +0800 Subject: x86, acpi: Map hotadded cpu to correct node. When hotadd new cpu to system, if its affinitive node is online, should map the cpu to its own node. Otherwise, let kernel select one online node for the new cpu later. Signed-off-by: Haicheng Li LKML-Reference: <4B6AAA39.6000300@linux.intel.com> Tested-by: Thomas Renninger Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/boot.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 036d28adf59..7db15e161aa 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include +# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -482,6 +483,25 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} + static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -540,6 +560,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) } cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; -- cgit v1.2.3-70-g09d2 From 18dce6ba5c8c6bd0f3ab4efa4cbdd698dab5c40a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:05 -0800 Subject: x86: Fix SCI on IOAPIC != 0 Thomas Renninger reported on IBM x3330 booting a latest kernel on this machine results in: PCI: PCI BIOS revision 2.10 entry at 0xfd61c, last bus=1 PCI: Using configuration type 1 for base access bio: create slab at 0 ACPI: SCI (IRQ30) allocation failed ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20090903/evevent-161) ACPI: Unable to start the ACPI Interpreter Later all kind of devices fail... and bisect it down to this commit: commit b9c61b70075c87a8612624736faf4a2de5b1ed30 x86/pci: update pirq_enable_irq() to setup io apic routing it turns out we need to set irq routing for the sci on ioapic1 early. -v2: make it work without sparseirq too. -v3: fix checkpatch.pl warning, and cc to stable Reported-by: Thomas Renninger Bisected-by: Thomas Renninger Tested-by: Thomas Renninger Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-2-git-send-email-yinghai@kernel.org> Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_apic.h | 1 + arch/x86/kernel/acpi/boot.c | 9 +++++++- arch/x86/kernel/apic/io_apic.c | 50 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 7c7c16cde1f..5f61f6e0ffd 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -160,6 +160,7 @@ extern int io_apic_get_redir_entries(int ioapic); struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); +void setup_IO_APIC_irq_extra(u32 gsi); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); extern void ioapic_insert_resources(void); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca..5c96b75c6ea 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -446,6 +446,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { *irq = gsi; + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) + setup_IO_APIC_irq_extra(gsi); +#endif + return 0; } @@ -473,7 +479,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif - acpi_gsi_to_irq(plat_gsi, &irq); + irq = plat_gsi; + return irq; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816..5e4cce254e4 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1538,6 +1538,56 @@ static void __init setup_IO_APIC_irqs(void) " (apicid-pin) not connected\n"); } +/* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int apic_id = 0, pin, idx, irq; + int node = cpu_to_node(boot_cpu_id); + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + apic_id = mp_find_ioapic(gsi); + if (apic_id < 0) + return; + + pin = mp_find_ioapic_pin(apic_id, gsi); + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, apic_id, pin); +#ifdef CONFIG_SPARSE_IRQ + desc = irq_to_desc(irq); + if (desc) + return; +#endif + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return; + } + + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + + if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[apic_id].apicid, pin); + return; + } + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); +} + /* * Set up the timer pin, possibly with the 8259A-master behind. */ -- cgit v1.2.3-70-g09d2 From ced5b697a76d325e7a7ac7d382dbbb632c765093 Mon Sep 17 00:00:00 2001 From: Brandon Phiilps Date: Wed, 10 Feb 2010 01:20:06 -0800 Subject: x86: Avoid race condition in pci_enable_msix() Keep chip_data in create_irq_nr and destroy_irq. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-3-git-send-email-yinghai@kernel.org> Signed-off-by: Brandon Phililps Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 18 ++++----------- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 52 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816..c86591b906f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3228,12 +3228,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3256,17 +3253,12 @@ void destroy_irq(unsigned int irq) { unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); + cfg = irq_to_desc(irq)->chip_data; __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } diff --git a/include/linux/irq.h b/include/linux/irq.h index 451481c082b..4d9b26e044b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -400,7 +400,9 @@ static inline int irq_has_action(unsigned int irq) /* Dynamic irq helper functions */ extern void dynamic_irq_init(unsigned int irq); +void dynamic_irq_init_keep_chip_data(unsigned int irq); extern void dynamic_irq_cleanup(unsigned int irq); +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq); /* Set/get chip/data for an IRQ: */ extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f66..d70394f12ee 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,11 +18,7 @@ #include "internals.h" -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) +static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc; unsigned long flags; @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) desc->depth = 1; desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->action = NULL; desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) } /** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * dynamic_irq_init - initialize a dynamically allocated irq * @irq: irq number to initialize */ -void dynamic_irq_cleanup(unsigned int irq) +void dynamic_irq_init(unsigned int irq) +{ + dynamic_irq_init_x(irq, false); +} + +/** + * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_init_keep_chip_data(unsigned int irq) +{ + dynamic_irq_init_x(irq, true); +} + +static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) } desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; desc->name = NULL; @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, false); +} + +/** + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, true); +} + /** * set_irq_chip - set the irq chip for an irq -- cgit v1.2.3-70-g09d2 From 27811d8cabe56e0c3622251b049086f49face4ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:07 -0800 Subject: x86: Move range related operation to one file We have almost the same code for mtrr cleanup and amd_bus checkup, and this code will also be used in replacing bootmem with early_res, so try to move them together and reuse it from different parts. Also rename update_range to subtract_range as that is what the function is actually doing. -v2: update comments as Christoph requested Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-4-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 180 ++++--------------------------------- arch/x86/kernel/mmconf-fam10h_64.c | 7 +- arch/x86/pci/amd_bus.c | 70 +++------------ include/linux/range.h | 22 +++++ kernel/Makefile | 2 +- kernel/range.c | 163 +++++++++++++++++++++++++++++++++ 6 files changed, 214 insertions(+), 230 deletions(-) create mode 100644 include/linux/range.h create mode 100644 kernel/range.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 09b1698e046..669da09ab9a 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -22,10 +22,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -34,11 +34,6 @@ #include "mtrr.h" -struct res_range { - unsigned long start; - unsigned long end; -}; - struct var_mtrr_range_state { unsigned long base_pfn; unsigned long size_pfn; @@ -56,7 +51,7 @@ struct var_mtrr_state { /* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 -static struct res_range __initdata range[RANGE_NUM]; +static struct range __initdata range[RANGE_NUM]; static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; @@ -64,152 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; #define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) - -static int __init -add_range(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - /* Out of slots: */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - int i; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - -static int __init clean_sort_range(struct res_range *range, int az) -{ - int i, j, k = az - 1, nr_range = 0; - - for (i = 0; i < k; i++) { - if (range[i].end) - continue; - for (j = k; j > i; j--) { - if (range[j].end) { - k = j; - break; - } - } - if (j == i) - break; - range[i].start = range[k].start; - range[i].end = range[k].end; - range[k].start = 0; - range[k].end = 0; - k--; - } - /* count it */ - for (i = 0; i < az; i++) { - if (!range[i].end) { - nr_range = i; - break; - } - } - - /* sort them */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); - - return nr_range; -} - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +x86_get_mtrr_mem_range(struct range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { @@ -223,13 +77,13 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, continue; base = range_state[i].base_pfn; size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size - 1); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } @@ -252,10 +106,10 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size - 1); } if (extra_remove_size) - subtract_range(range, extra_remove_base, + subtract_range(range, RANGE_NUM, extra_remove_base, extra_remove_base + extra_remove_size - 1); if (debug_print) { @@ -263,7 +117,7 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } } @@ -273,20 +127,16 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - return nr_range; } #ifdef CONFIG_MTRR_SANITIZER -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +static unsigned long __init sum_ranges(struct range *range, int nr_range) { unsigned long sum = 0; int i; @@ -621,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg) early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, +x86_setup_var_mtrrs(struct range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; @@ -742,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size, unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct res_range range_new[RANGE_NUM]; + static struct range range_new[RANGE_NUM]; unsigned long range_sums_new; static int nr_range_new; int num_reg; @@ -869,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits) * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, nr_range, 0, + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); /* Sort the ranges: */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + sort_range(range, nr_range); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 712d15fdc41..71825806cd4 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -7,6 +7,8 @@ #include #include #include +#include + #include #include #include @@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -struct range { - u64 start; - u64 end; -}; - static int __cpuinit cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 95ecbd49595..2356ea18697 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -2,6 +2,8 @@ #include #include #include +#include + #include #ifdef CONFIG_X86_64 @@ -17,58 +19,6 @@ #ifdef CONFIG_X86_64 -#define RANGE_NUM 16 - -struct res_range { - size_t start; - size_t end; -}; - -static void __init update_range(struct res_range *range, size_t start, - size_t end) -{ - int i; - int j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - struct pci_hostbridge_probe { u32 bus; u32 slot; @@ -111,6 +61,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void) fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; } +#define RANGE_NUM 16 + /** * early_fill_mp_bus_to_node() * called before pcibios_scan_root and pci_scan_bus @@ -132,7 +84,7 @@ static int __init early_fill_mp_bus_info(void) struct resource *res; size_t start; size_t end; - struct res_range range[RANGE_NUM]; + struct range range[RANGE_NUM]; u64 val; u32 address; @@ -226,7 +178,7 @@ static int __init early_fill_mp_bus_info(void) if (end > 0xffff) end = 0xffff; update_res(info, start, end, IORESOURCE_IO, 1); - update_range(range, start, end); + subtract_range(range, RANGE_NUM, start, end); } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ @@ -256,14 +208,14 @@ static int __init early_fill_mp_bus_info(void) end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); if (end < (1ULL<<32)) - update_range(range, 0, end - 1); + subtract_range(range, RANGE_NUM, 0, end - 1); /* get mmconfig */ get_pci_mmcfg_amd_fam10h_range(); /* need to take out mmconf range */ if (fam10h_mmconf_end) { printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); + subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end); } /* mmio resource */ @@ -318,7 +270,7 @@ static int __init early_fill_mp_bus_info(void) /* we got a hole */ endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); - update_range(range, start, endx); + subtract_range(range, RANGE_NUM, start, endx); printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); start = fam10h_mmconf_end + 1; changed = 1; @@ -334,7 +286,7 @@ static int __init early_fill_mp_bus_info(void) } update_res(info, start, end, IORESOURCE_MEM, 1); - update_range(range, start, end); + subtract_range(range, RANGE_NUM, start, end); printk(KERN_CONT "\n"); } @@ -349,7 +301,7 @@ static int __init early_fill_mp_bus_info(void) rdmsrl(address, val); end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); - update_range(range, 1ULL<<32, end - 1); + subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1); } /* diff --git a/include/linux/range.h b/include/linux/range.h new file mode 100644 index 00000000000..0789f1412e1 --- /dev/null +++ b/include/linux/range.h @@ -0,0 +1,22 @@ +#ifndef _LINUX_RANGE_H +#define _LINUX_RANGE_H + +struct range { + u64 start; + u64 end; +}; + +int add_range(struct range *range, int az, int nr_range, + u64 start, u64 end); + + +int add_range_with_merge(struct range *range, int az, int nr_range, + u64 start, u64 end); + +void subtract_range(struct range *range, int az, u64 start, u64 end); + +int clean_sort_range(struct range *range, int az); + +void sort_range(struct range *range, int nr_range); + +#endif diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f..ad47330ccf3 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o + async.o range.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 00000000000..71e0021281f --- /dev/null +++ b/kernel/range.c @@ -0,0 +1,163 @@ +/* + * Range add and subtract + */ +#include +#include +#include + +#include + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) +{ + if (start > end) + return nr_range; + + /* Out of slots: */ + if (nr_range >= az) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +int add_range_with_merge(struct range *range, int az, int nr_range, + u64 start, u64 end) +{ + int i; + + if (start > end) + return nr_range; + + /* Try to merge it with old one: */ + for (i = 0; i < nr_range; i++) { + u64 final_start, final_end; + u64 common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + + /* Need to add it: */ + return add_range(range, az, nr_range, start, end); +} + +void subtract_range(struct range *range, int az, u64 start, u64 end) +{ + int i, j; + + if (start > end) + return; + + for (j = 0; j < az; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && + range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && + range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* Find the new spare: */ + for (i = 0; i < az; i++) { + if (range[i].end == 0) + break; + } + if (i < az) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + s64 start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +int clean_sort_range(struct range *range, int az) +{ + int i, j, k = az - 1, nr_range = 0; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); + + return nr_range; +} + +void sort_range(struct range *range, int nr_range) +{ + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); +} -- cgit v1.2.3-70-g09d2 From e9a0064ad03b899938059bb576615ad9ed0f27f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:13 -0800 Subject: x86: Change range end to start+size So make interface more consistent with early_res. Later we can share some code with early_res. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-10-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 32 ++++++++++++++++---------------- arch/x86/pci/amd_bus.c | 24 ++++++++++++++---------- kernel/range.c | 20 ++++++++++---------- 3 files changed, 40 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 669da09ab9a..06130b52f01 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -78,13 +78,13 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, base = range_state[i].base_pfn; size = range_state[i].size_pfn; nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, - base, base + size - 1); + base, base + size); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } /* Take out UC ranges: */ @@ -106,11 +106,11 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, RANGE_NUM, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size); } if (extra_remove_size) subtract_range(range, RANGE_NUM, extra_remove_base, - extra_remove_base + extra_remove_size - 1); + extra_remove_base + extra_remove_size); if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); @@ -118,7 +118,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, if (!range[i].end) continue; printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } } @@ -128,7 +128,7 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range, printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", - range[i].start, range[i].end + 1); + range[i].start, range[i].end); } return nr_range; @@ -142,7 +142,7 @@ static unsigned long __init sum_ranges(struct range *range, int nr_range) int i; for (i = 0; i < nr_range; i++) - sum += range[i].end + 1 - range[i].start; + sum += range[i].end - range[i].start; return sum; } @@ -489,7 +489,7 @@ x86_setup_var_mtrrs(struct range *range, int nr_range, /* Write the range: */ for (i = 0; i < nr_range; i++) { set_var_mtrr_range(&var_state, range[i].start, - range[i].end - range[i].start + 1); + range[i].end - range[i].start); } /* Write the last range: */ @@ -720,7 +720,7 @@ int __init mtrr_cleanup(unsigned address_bits) * and fixed mtrrs should take effect before var mtrr for it: */ nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, - (1ULL<<(20 - PAGE_SHIFT)) - 1); + 1ULL<<(20 - PAGE_SHIFT)); /* Sort the ranges: */ sort_range(range, nr_range); @@ -939,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) nr_range = 0; if (mtrr_tom2) { range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); - range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; - if (highest_pfn < range[nr_range].end + 1) - highest_pfn = range[nr_range].end + 1; + range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT; + if (highest_pfn < range[nr_range].end) + highest_pfn = range[nr_range].end; nr_range++; } nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); @@ -953,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* Check the holes: */ for (i = 0; i < nr_range - 1; i++) { - if (range[i].end + 1 < range[i+1].start) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < range[i+1].start) + total_trim_size += real_trim_memory(range[i].end, range[i+1].start); } /* Check the top: */ i = nr_range - 1; - if (range[i].end + 1 < end_pfn) - total_trim_size += real_trim_memory(range[i].end + 1, + if (range[i].end < end_pfn) + total_trim_size += real_trim_memory(range[i].end, end_pfn); if (total_trim_size) { diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index ea6072fcf3d..fc1e8fe07e5 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -145,7 +145,7 @@ static int __init early_fill_mp_bus_info(void) def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); - range[0].end = 0xffff; + add_range(range, RANGE_NUM, 0, 0, 0xffff + 1); /* io port resource */ for (i = 0; i < 4; i++) { reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); @@ -175,7 +175,7 @@ static int __init early_fill_mp_bus_info(void) if (end > 0xffff) end = 0xffff; update_res(info, start, end, IORESOURCE_IO, 1); - subtract_range(range, RANGE_NUM, start, end); + subtract_range(range, RANGE_NUM, start, end + 1); } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ @@ -190,14 +190,16 @@ static int __init early_fill_mp_bus_info(void) if (!range[i].end) continue; - update_res(info, range[i].start, range[i].end, + update_res(info, range[i].start, range[i].end - 1, IORESOURCE_IO, 1); } } memset(range, 0, sizeof(range)); /* 0xfd00000000-0xffffffffff for HT */ - range[0].end = cap_resource((0xfdULL<<32) - 1); + end = cap_resource((0xfdULL<<32) - 1); + end++; + add_range(range, RANGE_NUM, 0, 0, end); /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; @@ -205,14 +207,15 @@ static int __init early_fill_mp_bus_info(void) end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20); if (end < (1ULL<<32)) - subtract_range(range, RANGE_NUM, 0, end - 1); + subtract_range(range, RANGE_NUM, 0, end); /* get mmconfig */ get_pci_mmcfg_amd_fam10h_range(); /* need to take out mmconf range */ if (fam10h_mmconf_end) { printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end); + subtract_range(range, RANGE_NUM, fam10h_mmconf_start, + fam10h_mmconf_end + 1); } /* mmio resource */ @@ -267,7 +270,8 @@ static int __init early_fill_mp_bus_info(void) /* we got a hole */ endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); - subtract_range(range, RANGE_NUM, start, endx); + subtract_range(range, RANGE_NUM, start, + endx + 1); printk(KERN_CONT " ==> [%llx, %llx]", start, endx); start = fam10h_mmconf_end + 1; changed = 1; @@ -284,7 +288,7 @@ static int __init early_fill_mp_bus_info(void) update_res(info, cap_resource(start), cap_resource(end), IORESOURCE_MEM, 1); - subtract_range(range, RANGE_NUM, start, end); + subtract_range(range, RANGE_NUM, start, end + 1); printk(KERN_CONT "\n"); } @@ -299,7 +303,7 @@ static int __init early_fill_mp_bus_info(void) rdmsrl(address, val); end = (val & 0xffffff800000ULL); printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20); - subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1); + subtract_range(range, RANGE_NUM, 1ULL<<32, end); } /* @@ -319,7 +323,7 @@ static int __init early_fill_mp_bus_info(void) continue; update_res(info, cap_resource(range[i].start), - cap_resource(range[i].end), + cap_resource(range[i].end - 1), IORESOURCE_MEM, 1); } } diff --git a/kernel/range.c b/kernel/range.c index 71e0021281f..74e2e611492 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -13,7 +13,7 @@ int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) { - if (start > end) + if (start >= end) return nr_range; /* Out of slots: */ @@ -33,7 +33,7 @@ int add_range_with_merge(struct range *range, int az, int nr_range, { int i; - if (start > end) + if (start >= end) return nr_range; /* Try to merge it with old one: */ @@ -46,7 +46,7 @@ int add_range_with_merge(struct range *range, int az, int nr_range, common_start = max(range[i].start, start); common_end = min(range[i].end, end); - if (common_start > common_end + 1) + if (common_start > common_end) continue; final_start = min(range[i].start, start); @@ -65,7 +65,7 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) { int i, j; - if (start > end) + if (start >= end) return; for (j = 0; j < az; j++) { @@ -79,15 +79,15 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) } if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; + range[j].start < end) { + range[j].start = end; continue; } if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; + range[j].end > start) { + range[j].end = start; continue; } @@ -99,11 +99,11 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) } if (i < az) { range[i].end = range[j].end; - range[i].start = end + 1; + range[i].start = end; } else { printk(KERN_ERR "run of slot in ranges\n"); } - range[j].end = start - 1; + range[j].end = start; continue; } } -- cgit v1.2.3-70-g09d2 From 79c601695870ca2a9c0ba9949a97d2be78ec07b2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:14 -0800 Subject: x86: Print out RAM buffer information So we can check that early in the bootlog. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-11-git-send-email-yinghai@kernel.org> Reviewed-by: Christoph Lameter Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a966b753e49..f406efeb4dc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1429,6 +1429,8 @@ void __init e820_reserve_resources_late(void) end = MAX_RESOURCE_SIZE; if (start >= end) continue; + printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ", + start, end); reserve_region_with_split(&iomem_resource, start, end, "RAM buffer"); } -- cgit v1.2.3-70-g09d2 From 1842f90cc98625d4d9bf8f8b927f17705ceb4e9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:15 -0800 Subject: x86: Call early_res_to_bootmem one time Simplify setup_node_mem: don't use bootmem from other node, instead just find_e820_area in early_node_mem. This keeps the boundary between early_res and boot mem more clear, and lets us only call early_res_to_bootmem() one time instead of for all nodes. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-12-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 1 + arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 3 +-- arch/x86/mm/numa_64.c | 62 ++++++++++++++++--------------------------------- 4 files changed, 22 insertions(+), 45 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3499b4fabc9..48cadbb1d28 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,6 +967,7 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); + early_res_to_bootmem(0, max_low_pfn<> PAGE_SHIFT, 0, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); - early_res_to_bootmem(0, end_pfn<bdata = &bootmem_node_data[nodeid]; @@ -227,11 +234,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) * of alloc_bootmem, that could clash with reserved range */ bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); - nid = phys_to_nid(nodedata_phys); - if (nid == nodeid) - bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); - else - bootmap_start = roundup(start, PAGE_SIZE); + bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE); /* * SMP_CACHE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -239,18 +242,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) bootmap = early_node_mem(nodeid, bootmap_start, end, bootmap_pages<= end) { - /* - * only need to free it if it is from other node - * bootmem - */ - if (nid != nodeid) - free_bootmem(nodedata_phys, pgdat_size); - } + free_early(nodedata_phys, nodedata_phys + pgdat_size); node_data[nodeid] = NULL; return; } bootmap_start = __pa(bootmap); + reserve_early(bootmap_start, bootmap_start+(bootmap_pages<> PAGE_SHIFT, @@ -259,31 +257,11 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", bootmap_start, bootmap_start + bootmap_size - 1, bootmap_pages); - - free_bootmem_with_active_regions(nodeid, end); - - /* - * convert early reserve to bootmem reserve earlier - * otherwise early_node_mem could use early reserved mem - * on previous node - */ - early_res_to_bootmem(start, end); - - /* - * in some case early_node_mem could use alloc_bootmem - * to get range on other node, don't reserve that again - */ - if (nid != nodeid) - printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, - pgdat_size, BOOTMEM_DEFAULT); nid = phys_to_nid(bootmap_start); if (nid != nodeid) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); - else - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages< Date: Wed, 10 Feb 2010 01:20:16 -0800 Subject: x86: Introduce max_early_res and early_res_count To prepare allocate early res array from fine_e820_area. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-13-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 47 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index f406efeb4dc..7053f4adb8e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -732,14 +732,18 @@ core_initcall(e820_mark_nvs_memory); /* * Early reserved memory areas. */ -#define MAX_EARLY_RES 32 +/* + * need to make sure this one is bigger enough before + * find_e820_area could be used + */ +#define MAX_EARLY_RES_X 32 struct early_res { u64 start, end; - char name[16]; + char name[15]; char overlap_ok; }; -static struct early_res early_res[MAX_EARLY_RES] __initdata = { +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = { { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ #if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) /* @@ -753,12 +757,22 @@ static struct early_res early_res[MAX_EARLY_RES] __initdata = { {} }; +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata = +#ifdef CONFIG_X86_32 + 2 +#else + 1 +#endif + ; + static int __init find_overlapped_early(u64 start, u64 end) { int i; struct early_res *r; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { r = &early_res[i]; if (end > r->start && start < r->end) break; @@ -776,13 +790,14 @@ static void __init drop_range(int i) { int j; - for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + for (j = i + 1; j < max_early_res && early_res[j].end; j++) ; memmove(&early_res[i], &early_res[i + 1], (j - 1 - i) * sizeof(struct early_res)); early_res[j - 1].end = 0; + early_res_count--; } /* @@ -801,9 +816,9 @@ static void __init drop_overlaps_that_are_ok(u64 start, u64 end) struct early_res *r; u64 lower_start, lower_end; u64 upper_start, upper_end; - char name[16]; + char name[15]; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { r = &early_res[i]; /* Continue past non-overlapping ranges */ @@ -859,7 +874,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name, struct early_res *r; i = find_overlapped_early(start, end); - if (i >= MAX_EARLY_RES) + if (i >= max_early_res) panic("Too many early reservations"); r = &early_res[i]; if (r->end) @@ -872,6 +887,7 @@ static void __init __reserve_early(u64 start, u64 end, char *name, r->overlap_ok = overlap_ok; if (name) strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; } /* @@ -924,7 +940,7 @@ void __init free_early(u64 start, u64 end) i = find_overlapped_early(start, end); r = &early_res[i]; - if (i >= MAX_EARLY_RES || r->end != end || r->start != start) + if (i >= max_early_res || r->end != end || r->start != start) panic("free_early on not reserved area: %llx-%llx!", start, end - 1); @@ -935,14 +951,15 @@ void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; u64 final_start, final_end; + int idx = 0; count = 0; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) + for (i = 0; i < max_early_res && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count, start, end); - for (i = 0; i < count; i++) { + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, r->start, r->end, r->name); @@ -969,7 +986,7 @@ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) again: i = find_overlapped_early(addr, addr + size); r = &early_res[i]; - if (i < MAX_EARLY_RES && r->end) { + if (i < max_early_res && r->end) { *addrp = addr = round_up(r->end, align); changed = 1; goto again; @@ -986,7 +1003,7 @@ static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) int changed = 0; again: last = addr + size; - for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + for (i = 0; i < max_early_res && early_res[i].end; i++) { struct early_res *r = &early_res[i]; if (last > r->start && addr < r->start) { size = r->start - addr; -- cgit v1.2.3-70-g09d2 From 28b1c57d3c1f8df69c958f2ae7b9e4b67538ff4d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:17 -0800 Subject: x86: Dynamically increase early_res array size Use early_res_count to track the num, and use find_e820 to get a new buffer, then copy from the old to the new one. Also, clear early_res to prevent later invalid usage. -v2 _check_and_double_early_res should take new start Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-14-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7053f4adb8e..e09c18c8f3c 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -916,6 +916,48 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } +static void __init __check_and_double_early_res(u64 start) +{ + u64 end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + end = max_pfn_mapped << PAGE_SHIFT; + size = sizeof(struct early_res) * max_early_res * 2; + mem = find_e820_area(start, end, size, sizeof(struct early_res)); + + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + /* * Most early reservations come here. * @@ -929,6 +971,8 @@ void __init reserve_early(u64 start, u64 end, char *name) if (start >= end) return; + __check_and_double_early_res(end); + drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); } @@ -957,6 +1001,10 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < max_early_res && early_res[i].end; i++) count++; + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", count - idx, max_early_res, start, end); for (i = idx; i < count; i++) { @@ -974,6 +1022,11 @@ void __init early_res_to_bootmem(u64 start, u64 end) reserve_bootmem_generic(final_start, final_end - final_start, BOOTMEM_DEFAULT); } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; } /* Check for already reserved areas */ -- cgit v1.2.3-70-g09d2 From c252a5bb1f57afb1e336d68085217727ca7b2134 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:19 -0800 Subject: x86: Only call dma32_reserve_bootmem 64bit !CONFIG_NUMA 64bit NUMA already make enough space under 4G with new early_node_mem. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-16-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pci.h | 2 ++ arch/x86/include/asm/pci_64.h | 2 -- arch/x86/kernel/pci-dma.c | 13 ++++++++++--- arch/x86/kernel/setup.c | 7 ------- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ada8c201d51..b4a00dd4eed 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -124,6 +124,8 @@ extern void pci_iommu_alloc(void); #include "pci_64.h" #endif +void dma32_reserve_bootmem(void); + /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h index ae5e40f67da..fe15cfb21b9 100644 --- a/arch/x86/include/asm/pci_64.h +++ b/arch/x86/include/asm/pci_64.h @@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn, extern int (*pci_config_write)(int seg, int bus, int dev, int fn, int reg, int len, u32 value); -extern void dma32_reserve_bootmem(void); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_PCI_64_H */ diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 75e14e21f61..1aa966c565f 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -65,7 +65,7 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA) static __initdata void *dma32_bootmem_ptr; static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); @@ -116,14 +116,21 @@ static void __init dma32_free_bootmem(void) dma32_bootmem_ptr = NULL; dma32_bootmem_size = 0; } +#else +void __init dma32_reserve_bootmem(void) +{ +} +static void __init dma32_free_bootmem(void) +{ +} + #endif void __init pci_iommu_alloc(void) { -#ifdef CONFIG_X86_64 /* free the range so iommu could get some range less than 4G */ dma32_free_bootmem(); -#endif + if (pci_swiotlb_detect()) goto out; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 48cadbb1d28..ea4141b4851 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -969,14 +969,7 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn, acpi, k8); early_res_to_bootmem(0, max_low_pfn< Date: Thu, 11 Feb 2010 11:50:59 -0800 Subject: x86, ptrace: regset extensions to support xstate Add the xstate regset support which helps extend the kernel ptrace and the core-dump interfaces to support AVX state etc. This regset interface is designed to support all the future state that gets supported using xsave/xrstor infrastructure. Looking at the memory layout saved by "xsave", one can't say which state is represented in the memory layout. This is because if a particular state is in init state, in the xsave hdr it can be represented by bit '0'. And hence we can't really say by the xsave header wether a state is in init state or the state is not saved in the memory layout. And hence the xsave memory layout available through this regset interface uses SW usable bytes [464..511] to convey what state is represented in the memory layout. First 8 bytes of the sw_usable_bytes[464..467] will be set to OS enabled xstate mask(which is same as the 64bit mask returned by the xgetbv's xCR0). The note NT_X86_XSTATE represents the extended state information in the core file, using the above mentioned memory layout. Signed-off-by: Suresh Siddha LKML-Reference: <20100211195614.802495327@sbs-t61.sc.intel.com> Signed-off-by: Hongjiu Lu Cc: Roland McGrath Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i387.h | 12 +++++-- arch/x86/include/asm/user.h | 58 +++++++++++++++++++++++++++++++ arch/x86/include/asm/xsave.h | 2 ++ arch/x86/kernel/i387.c | 83 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ptrace.c | 34 ++++++++++++++++-- arch/x86/kernel/xsave.c | 1 + include/linux/elf.h | 1 + 7 files changed, 187 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index ebfb8a9e11f..da293092450 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -33,8 +33,16 @@ extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active extern struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h index 999873b22e7..24532c7da3d 100644 --- a/arch/x86/include/asm/user.h +++ b/arch/x86/include/asm/user.h @@ -1,5 +1,63 @@ +#ifndef _ASM_X86_USER_H +#define _ASM_X86_USER_H + #ifdef CONFIG_X86_32 # include "user_32.h" #else # include "user_64.h" #endif + +#include + +struct user_ymmh_regs { + /* 16 * 16 bytes for each YMMH-reg */ + __u32 ymmh_space[64]; +}; + +struct user_xsave_hdr { + __u64 xstate_bv; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +/* + * The structure layout of user_xstateregs, used for exporting the + * extended register state through ptrace and core-dump (NT_X86_XSTATE note) + * interfaces will be same as the memory layout of xsave used by the processor + * (except for the bytes 464..511, which can be used by the software) and hence + * the size of this structure varies depending on the features supported by the + * processor and OS. The size of the structure that users need to use can be + * obtained by doing: + * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx); + * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.) + * need to use. + * + * For now, only the first 8 bytes of the software usable bytes[464..471] will + * be used and will be set to OS enabled xstate mask (which is same as the + * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump + * remotely, etc.) can use this mask as well as the mask saved in the + * xstate_hdr bytes and interpret what states the processor/OS supports + * and what states are in modified/initialized conditions for the + * particular process/thread. + * + * Also when the user modifies certain state FP/SSE/etc through the + * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * bytes[512..519] of the memory layout are updated correspondingly. + * i.e., for example when FP state is modified to a non-init state, + * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to + * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. + */ +#define USER_XSTATE_FX_SW_WORDS 6 +#define USER_XSTATE_XCR0_WORD 0 + +struct user_xstateregs { + struct { + __u64 fpx_space[58]; + __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; + } i387; + struct user_xsave_hdr xsave_hdr; + struct user_ymmh_regs ymmh; + /* further processor state extensions go here */ +}; + +#endif /* _ASM_X86_USER_H */ diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 727acc15234..ddc04ccad03 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -27,9 +27,11 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_cntxt_init(void); extern void xsave_init(void); +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); extern int check_for_xstate(struct i387_fxsave_struct __user *buf, void __user *fpstate, diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f2f8540a7f3..7a8a193b514 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -164,6 +164,11 @@ int init_fpu(struct task_struct *tsk) return 0; } +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { return tsk_used_math(target) ? regset->n : 0; @@ -224,6 +229,84 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, return ret; } +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * First copy the fxsave bytes 0..463. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, + offsetof(struct user_xstateregs, + i387.xstate_fx_sw)); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by software. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + xstate_fx_sw_bytes, + offsetof(struct user_xstateregs, + i387.xstate_fx_sw), + offsetof(struct user_xstateregs, + xsave_hdr)); + if (ret) + return ret; + + /* + * Copy the rest of xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave.xsave_hdr, + offsetof(struct user_xstateregs, + xsave_hdr), -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639f..16433a59b39 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -48,6 +48,7 @@ enum x86_regset { REGSET_FP, REGSET_XFP, REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, REGSET_TLS, REGSET_IOPERM32, }; @@ -1584,7 +1585,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static const struct user_regset x86_64_regsets[] = { +static struct user_regset x86_64_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1597,6 +1598,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, @@ -1622,7 +1629,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static const struct user_regset x86_32_regsets[] = { +static struct user_regset x86_32_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1641,6 +1648,12 @@ static const struct user_regset x86_32_regsets[] = { .size = sizeof(u32), .align = sizeof(u32), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, @@ -1663,6 +1676,23 @@ static const struct user_regset_view user_x86_32_view = { }; #endif +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c5ee17e8c6d..782c3a362ec 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void) cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; + update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); setup_xstate_init(); diff --git a/include/linux/elf.h b/include/linux/elf.h index 0cc4d55151b..a8c4af073ce 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -361,6 +361,7 @@ typedef struct elf64_shdr { #define NT_PPC_VSX 0x102 /* PowerPC VSX registers */ #define NT_386_TLS 0x200 /* i386 TLS slots (struct user_desc) */ #define NT_386_IOPERM 0x201 /* x86 io permission bitmap (1=deny) */ +#define NT_X86_XSTATE 0x202 /* x86 extended state using xsave */ #define NT_S390_HIGH_GPRS 0x300 /* s390 upper register halves */ -- cgit v1.2.3-70-g09d2 From 08677214e318297f228237be0042aac754f48f1d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:20 -0800 Subject: x86: Make 64 bit use early_res instead of bootmem before slab Finally we can use early_res to replace bootmem for x86_64 now. Still can use CONFIG_NO_BOOTMEM to enable it or not. -v2: fix 32bit compiling about MAX_DMA32_PFN -v3: folded bug fix from LKML message below Signed-off-by: Yinghai Lu LKML-Reference: <4B747239.4070907@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 13 +++ arch/x86/include/asm/e820.h | 6 ++ arch/x86/kernel/e820.c | 159 +++++++++++++++++++++++++++++++++--- arch/x86/kernel/setup.c | 2 + arch/x86/mm/init_64.c | 4 + arch/x86/mm/numa_64.c | 20 +++-- include/linux/bootmem.h | 7 ++ include/linux/mm.h | 5 ++ include/linux/mmzone.h | 2 + mm/bootmem.c | 195 +++++++++++++++++++++++++++++++++++++++++++- mm/page_alloc.c | 59 +++++++++++++- mm/percpu.c | 3 + mm/sparse-vmemmap.c | 2 +- 13 files changed, 454 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb4092568f9..95439843ceb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -568,6 +568,19 @@ config PARAVIRT_DEBUG Enable to debug paravirt_ops internals. Specifically, BUG if a paravirt_op is missing when it is called. +config NO_BOOTMEM + default y + bool "Disable Bootmem code" + depends on X86_64 + ---help--- + Use early_res directly instead of bootmem before slab is ready. + - allocator (buddy) [generic] + - early allocator (bootmem) [generic] + - very early allocator (reserve_early*()) [x86] + - very very early allocator (early brk model) [x86] + So reduce one layer between early allocator to final allocator + + config MEMTEST bool "Memtest" ---help--- diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 761249e396f..7d72e5fb700 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,6 +117,12 @@ extern void free_early(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +void reserve_early_without_check(u64 start, u64 end, char *name); +u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align); +#include +int get_free_all_memory_range(struct range **rangep, int nodeid); + extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); extern int e820_find_active_region(const struct e820entry *ei, diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index e09c18c8f3c..90a85295f33 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -977,6 +977,25 @@ void __init reserve_early(u64 start, u64 end, char *name) __reserve_early(start, end, name, 0); } +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + void __init free_early(u64 start, u64 end) { struct early_res *r; @@ -991,6 +1010,94 @@ void __init free_early(u64 start, u64 end) drop_range(i); } +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#if 1 + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if 0 + printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) { +#if 0 + printk(KERN_CONT "\n"); +#endif + continue; + } +#if 0 + printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n", + final_start, final_end); +#endif + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else void __init early_res_to_bootmem(u64 start, u64 end) { int i, count; @@ -1028,6 +1135,7 @@ void __init early_res_to_bootmem(u64 start, u64 end) max_early_res = 0; early_res_count = 0; } +#endif /* Check for already reserved areas */ static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) @@ -1081,6 +1189,35 @@ again: return changed; } +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range. */ @@ -1090,24 +1227,20 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - continue; - if (last > end) + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) continue; + return addr; } return -1ULL; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ea4141b4851..d49e168bda8 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -967,7 +967,9 @@ void __init setup_arch(char **cmdline_p) #endif initmem_init(0, max_pfn, acpi, k8); +#ifndef CONFIG_NO_BOOTMEM early_res_to_bootmem(0, max_low_pfn<> PAGE_SHIFT; @@ -235,10 +237,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + NODE_DATA(nodeid)->node_id = nodeid; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; +#ifndef CONFIG_NO_BOOTMEM + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; + /* * Find a place for the bootmem map * nodedata_phys could be on other nodes by alloc_bootmem, @@ -275,6 +280,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); free_bootmem_with_active_regions(nodeid, end); +#endif node_set_online(nodeid); } @@ -733,6 +739,10 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); +#ifdef CONFIG_NO_BOOTMEM + pages += free_all_memory_core_early(MAX_NUMNODES); +#endif + return pages; } diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index b10ec49ee2d..266ab929123 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -23,6 +23,7 @@ extern unsigned long max_pfn; extern unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM /* * node_bootmem_map is a map pointer - the bits represent all physical * memory pages (including holes) on the node. @@ -37,6 +38,7 @@ typedef struct bootmem_data { } bootmem_data_t; extern bootmem_data_t bootmem_node_data[]; +#endif extern unsigned long bootmem_bootmap_pages(unsigned long); @@ -46,6 +48,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long endpfn); extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); +unsigned long free_all_memory_core_early(int nodeid); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); @@ -84,6 +87,10 @@ extern void *__alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +void *__alloc_bootmem_node_high(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal); extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b2fa8593c6..f2c5b3cee8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1049,6 +1050,10 @@ extern void get_pfn_range_for_nid(unsigned int nid, extern unsigned long find_min_pfn_with_active_regions(void); extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); +int add_from_early_node_map(struct range *range, int az, + int nr_range, int nid); +void *__alloc_memory_core_early(int nodeid, u64 size, u64 align, + u64 goal, u64 limit); typedef int (*work_fn_t)(unsigned long, unsigned long, void *); extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 30fe668c254..eae8387b600 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -620,7 +620,9 @@ typedef struct pglist_data { struct page_cgroup *node_page_cgroup; #endif #endif +#ifndef CONFIG_NO_BOOTMEM struct bootmem_data *bdata; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* * Must be held any time you expect node_start_pfn, node_present_pages diff --git a/mm/bootmem.c b/mm/bootmem.c index 7d1486875e1..d7c791ef003 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -32,6 +33,7 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } - +#endif /* * free_bootmem_late - free bootmem pages directly to page allocator * @addr: starting address of the range @@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) } } +#ifdef CONFIG_NO_BOOTMEM +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { +#if 1 + printk(KERN_DEBUG " %lx - %lx\n", start, end); +#endif + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + +#if 1 + printk(KERN_DEBUG " %lx %lx - %lx %lx\n", + start, start_aligned, end_aligned, end); +#endif + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} +#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); +#ifdef CONFIG_NO_BOOTMEM + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +#else return free_all_bootmem_core(pgdat->bdata); +#endif } /** @@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { +#ifdef CONFIG_NO_BOOTMEM + return free_all_memory_core_early(NODE_DATA(0)->node_id); +#else return free_all_bootmem_core(NODE_DATA(0)->bdata); +#endif } +#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } +#endif /** * free_bootmem_node - mark a page range as usable @@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(physaddr, physaddr + size); +#if 0 + printk(KERN_DEBUG "free %lx %lx\n", physaddr, size); +#endif +#else unsigned long start, end; kmemleak_free_part(__va(physaddr), size); @@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +#endif } /** @@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + free_early(addr, addr + size); +#if 0 + printk(KERN_DEBUG "free %lx %lx\n", addr, size); +#endif +#else unsigned long start, end; kmemleak_free_part(__va(addr), size); @@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); +#endif } /** @@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +#endif } /** @@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); +#endif } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init align_idx(struct bootmem_data *bdata, unsigned long idx, unsigned long step) { @@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } +#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { +#ifdef CONFIG_NO_BOOTMEM + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +#else bootmem_data_t *bdata; void *region; @@ -613,6 +727,7 @@ restart: } return NULL; +#endif } /** @@ -631,7 +746,13 @@ restart: void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem_nopanic(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem_nopanic(size, align, goal, limit); } static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, @@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem(size, align, goal, limit); } +#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } +#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#endif +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); +#else + ptr = alloc_bootmem_core(pgdat->bdata, size, align, + new_goal, 0); +#endif + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + } #ifdef CONFIG_SPARSEMEM @@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { +#ifdef CONFIG_NO_BOOTMEM + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +#endif } #endif @@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); +#endif if (ptr) return ptr; @@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + return __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#else return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); +#endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0fd5b..78821a28e39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3435,6 +3435,59 @@ void __init free_bootmem_with_active_regions(int nid, } } +int __init add_from_early_node_map(struct range *range, int az, + int nr_range, int nid) +{ + int i; + u64 start, end; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } + return nr_range; +} + +void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + void *ptr; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + addr = find_early_area(ei_start, ei_last, + goal, limit, size, align); + + if (addr == -1ULL) + continue; + +#if 0 + printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", + nid, + ei_start, ei_last, goal, limit, size, + align, addr); +#endif + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + reserve_early_without_check(addr, addr + size, "BOOTMEM"); + return ptr; + } + + return NULL; +} + + void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -4467,7 +4520,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { +#ifndef CONFIG_NO_BOOTMEM + .bdata = &bootmem_node_data[0] +#endif + }; EXPORT_SYMBOL(contig_page_data); #endif diff --git a/mm/percpu.c b/mm/percpu.c index 083e7c91e5f..841defeeef8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1929,7 +1929,10 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); +#ifndef CONFIG_NO_BOOTMEM + /* fix partial free ! */ free_fn(ptr + size_sum, ai->unit_size - size_sum); +#endif } } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a..9506c39942f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -40,7 +40,7 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); + return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); } -- cgit v1.2.3-70-g09d2 From db8f77c889542b09457b8b97efb311343c99a75d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:23 -0800 Subject: x86: Move bios page reserve early to head32/64.c So prepare to make one more clean of early_res.c. -v2: don't need to reserve first page in early_res because we already mark that in e820 as reserved already. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-20-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 22 ++-------------------- arch/x86/kernel/head32.c | 10 ++++++++++ 2 files changed, 12 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 90a85295f33..4004f10285d 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -743,29 +743,11 @@ struct early_res { char name[15]; char overlap_ok; }; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata = { - { 0, PAGE_SIZE, "BIOS data page", 1 }, /* BIOS data page */ -#if defined(CONFIG_X86_32) && defined(CONFIG_X86_TRAMPOLINE) - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - { PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE", 1 }, -#endif - - {} -}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; static int max_early_res __initdata = MAX_EARLY_RES_X; static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata = -#ifdef CONFIG_X86_32 - 2 -#else - 1 -#endif - ; +static int early_res_count __initdata; static int __init find_overlapped_early(u64 start, u64 end) { diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 5051b94c906..adedeef1ded 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -29,6 +29,16 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { +#ifdef CONFIG_X86_TRAMPOLINE + /* + * But first pinch a few for the stack/trampoline stuff + * FIXME: Don't need the extra page at 4K, but need to fix + * trampoline before removing it. (see the GDT stuff) + */ + reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, + "EX TRAMPOLINE"); +#endif + reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.3-70-g09d2 From a678c2be75773e112f6d656a22a7f1645c4dbd6c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:24 -0800 Subject: x86: Separate early_res related code from e820.c ... to make e820.c smaller. -v2: fix 32bit compiling with MAX_DMA32_PFN Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-21-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/e820.h | 13 +- arch/x86/include/asm/early_res.h | 20 ++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 541 +-------------------------------------- arch/x86/kernel/early_res.c | 538 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 561 insertions(+), 553 deletions(-) create mode 100644 arch/x86/include/asm/early_res.h create mode 100644 arch/x86/kernel/early_res.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 7d72e5fb700..efad699a2c2 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -109,19 +109,8 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long end_user_pfn; -extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); -extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); -extern void reserve_early(u64 start, u64 end, char *name); -extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); -extern void free_early(u64 start, u64 end); -extern void early_res_to_bootmem(u64 start, u64 end); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); - -void reserve_early_without_check(u64 start, u64 end, char *name); -u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align); -#include -int get_free_all_memory_range(struct range **rangep, int nodeid); +#include extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h new file mode 100644 index 00000000000..2d43b166782 --- /dev/null +++ b/arch/x86/include/asm/early_res.h @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_EARLY_RES_H +#define _ASM_X86_EARLY_RES_H +#ifdef __KERNEL__ + +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); +extern void reserve_early(u64 start, u64 end, char *name); +extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); +extern void free_early(u64 start, u64 end); +extern void early_res_to_bootmem(u64 start, u64 end); + +void reserve_early_without_check(u64 start, u64 end, char *name); +u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align); +#include +int get_free_all_memory_range(struct range **rangep, int nodeid); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_EARLY_RES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a5..f5fb9f0b627 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o -obj-y += bootflag.o e820.o +obj-y += bootflag.o e820.o early_res.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4004f10285d..82db4015604 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -12,21 +12,14 @@ #include #include #include -#include -#include -#include -#include -#include #include #include #include -#include -#include #include +#include #include #include -#include /* * The e820 map is the map that gets modified e.g. with command line parameters @@ -729,538 +722,6 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_e820_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name?name:"", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -static void __init __check_and_double_early_res(u64 start) -{ - u64 end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - end = max_pfn_mapped << PAGE_SHIFT; - size = sizeof(struct early_res) * max_early_res * 2; - mem = find_e820_area(start, end, size, sizeof(struct early_res)); - - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#if 1 - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if 0 - printk(KERN_INFO " #%d [%010llx - %010llx] %15s", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) { -#if 0 - printk(KERN_CONT "\n"); -#endif - continue; - } -#if 0 - printk(KERN_CONT " subtract pfn [%010llx - %010llx]\n", - final_start, final_end); -#endif - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; -#ifdef MAX_DMA32_PFN - if (max_pfn_mapped > MAX_DMA32_PFN) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - end = max_pfn_mapped << PAGE_SHIFT; - mem = find_e820_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? */ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr == -1ULL) - continue; - - return addr; - } - return -1ULL; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; - - if (ei->type != E820_RAM) - continue; - addr = round_up(ei->addr, align); - ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - continue; - return addr; - } - - return -1ULL; -} - /* * pre allocated 4k and reserved it in e820 */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c new file mode 100644 index 00000000000..1cf2c2f9ea6 --- /dev/null +++ b/arch/x86/kernel/early_res.c @@ -0,0 +1,538 @@ +/* + * early_res, could be used to replace bootmem + */ +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Early reserved memory areas. + */ +/* + * need to make sure this one is bigger enough before + * find_e820_area could be used + */ +#define MAX_EARLY_RES_X 32 + +struct early_res { + u64 start, end; + char name[15]; + char overlap_ok; +}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; + +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < max_early_res && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; + early_res_count--; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. + */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[15]; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); + if (i >= max_early_res) + panic("Too many early reservations"); + r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name ? name : "", r->start, + r->end - 1, r->name); + r->start = start; + r->end = end; + r->overlap_ok = overlap_ok; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +static void __init __check_and_double_early_res(u64 start) +{ + u64 end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + end = max_pfn_mapped << PAGE_SHIFT; + size = sizeof(struct early_res) * max_early_res * 2; + mem = find_e820_area(start, end, size, sizeof(struct early_res)); + + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. + */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + __check_and_double_early_res(end); + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= max_early_res || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#define DEBUG_PRINT_EARLY_RES 1 + +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; +#ifdef MAX_DMA32_PFN + if (max_pfn_mapped > MAX_DMA32_PFN) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + end = max_pfn_mapped << PAGE_SHIFT; + mem = find_e820_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; +} +#endif + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < max_early_res && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < max_early_res && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr == -1ULL) + continue; + + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr, last; + u64 ei_last; + + if (ei->type != E820_RAM) + continue; + addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + continue; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && + addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + continue; + return addr; + } + + return -1ULL; +} -- cgit v1.2.3-70-g09d2 From 7da657d1f1dd27fa9d8289d5f7e53479c7fd3a95 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:25 -0800 Subject: x86: Add find_early_area_size Prepare to move bck find_e820_area_size back to e820.c. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-22-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/early_res.c | 45 ++++++++++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 1cf2c2f9ea6..bfa1ba705d4 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -476,6 +476,29 @@ out: return -1ULL; } +u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return -1ULL; +} + /* * Find a free area with specified alignment in a specific range. */ @@ -513,24 +536,20 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; - u64 addr, last; - u64 ei_last; + u64 addr; + u64 ei_start, ei_last; if (ei->type != E820_RAM) continue; - addr = round_up(ei->addr, align); + ei_last = ei->addr + ei->size; - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && - addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr == -1ULL) continue; + return addr; } -- cgit v1.2.3-70-g09d2 From efdd0e81df0f23830c6d2cb971cf87f415b8dbdb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:26 -0800 Subject: x86: Move back find_e820_area to e820.c Makes early_res.c more clean, so later could move it to /kernel. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-23-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/e820.h | 2 ++ arch/x86/include/asm/early_res.h | 4 +-- arch/x86/kernel/e820.c | 53 +++++++++++++++++++++++++++++++++++++ arch/x86/kernel/early_res.c | 57 ---------------------------------------- 4 files changed, 57 insertions(+), 59 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index efad699a2c2..a8299e13443 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -109,6 +109,8 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long end_user_pfn; +extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); +extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); #include diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h index 2d43b166782..5a4d2eb8e79 100644 --- a/arch/x86/include/asm/early_res.h +++ b/arch/x86/include/asm/early_res.h @@ -2,8 +2,6 @@ #define _ASM_X86_EARLY_RES_H #ifdef __KERNEL__ -extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); -extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern void reserve_early(u64 start, u64 end, char *name); extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); @@ -12,6 +10,8 @@ extern void early_res_to_bootmem(u64 start, u64 end); void reserve_early_without_check(u64 start, u64 end, char *name); u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, u64 size, u64 align); +u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align); #include int get_free_all_memory_range(struct range **rangep, int nodeid); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 82db4015604..b4e512b03aa 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -722,6 +722,59 @@ static int __init e820_mark_nvs_memory(void) core_initcall(e820_mark_nvs_memory); #endif +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area(ei_start, ei_last, start, end, + size, align); + + if (addr != -1ULL) + return addr; + } + return -1ULL; +} + +/* + * Find next free range after *start + */ +u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + u64 addr; + u64 ei_start, ei_last; + + if (ei->type != E820_RAM) + continue; + + ei_last = ei->addr + ei->size; + ei_start = ei->addr; + addr = find_early_area_size(ei_start, ei_last, start, + sizep, align); + + if (addr != -1ULL) + return addr; + } + + return -1ULL; +} + /* * pre allocated 4k and reserved it in e820 */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index bfa1ba705d4..1b99a2619f9 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -498,60 +498,3 @@ u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, out: return -1ULL; } - -/* - * Find a free area with specified alignment in a specific range. - */ -u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area(ei_start, ei_last, start, end, - size, align); - - if (addr == -1ULL) - continue; - - return addr; - } - return -1ULL; -} - -/* - * Find next free range after *start - */ -u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align) -{ - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - u64 addr; - u64 ei_start, ei_last; - - if (ei->type != E820_RAM) - continue; - - ei_last = ei->addr + ei->size; - ei_start = ei->addr; - addr = find_early_area_size(ei_start, ei_last, start, - sizep, align); - - if (addr == -1ULL) - continue; - - return addr; - } - - return -1ULL; -} -- cgit v1.2.3-70-g09d2 From 53db62a2529280ff216c941d8a2650204547e44a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:27 -0800 Subject: early_res: Enhance check_and_double_early_res ... to make it always try to start from low at first. This makes it less likely for early_memtest to reserve a bad range, in particular it puts new early_res in a range that is already tested. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-24-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/early_res.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 1b99a2619f9..dbf08bd0125 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -180,9 +180,9 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } -static void __init __check_and_double_early_res(u64 start) +static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) { - u64 end, size, mem; + u64 start, end, size, mem; struct early_res *new; /* do we have enough slots left ? */ @@ -190,10 +190,23 @@ static void __init __check_and_double_early_res(u64 start) return; /* double it */ - end = max_pfn_mapped << PAGE_SHIFT; + mem = -1ULL; size = sizeof(struct early_res) * max_early_res * 2; - mem = find_e820_area(start, end, size, sizeof(struct early_res)); - + if (early_res == early_res_x) + start = 0; + else + start = early_res[0].end; + end = ex_start; + if (start + size < end) + mem = find_e820_area(start, end, size, + sizeof(struct early_res)); + if (mem == -1ULL) { + start = ex_end; + end = max_pfn_mapped << PAGE_SHIFT; + if (start + size < end) + mem = find_e820_area(start, end, size, + sizeof(struct early_res)); + } if (mem == -1ULL) panic("can not find more space for early_res array"); @@ -235,7 +248,7 @@ void __init reserve_early(u64 start, u64 end, char *name) if (start >= end) return; - __check_and_double_early_res(end); + __check_and_double_early_res(start, end); drop_overlaps_that_are_ok(start, end); __reserve_early(start, end, name, 0); @@ -248,7 +261,7 @@ void __init reserve_early_without_check(u64 start, u64 end, char *name) if (start >= end) return; - __check_and_double_early_res(end); + __check_and_double_early_res(start, end); r = &early_res[early_res_count]; -- cgit v1.2.3-70-g09d2 From 59be5a8e8ce765cf739ec7f07176219972de7481 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:28 -0800 Subject: x86: Make 32bit support NO_BOOTMEM Let's make 32bit consistent with 64bit. -v2: Andrew pointed out for 32bit that we should use -1ULL Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-25-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 1 - arch/x86/kernel/early_res.c | 3 +++ arch/x86/mm/init_32.c | 6 ++++++ arch/x86/mm/numa_32.c | 3 +++ 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 95439843ceb..29f9efb74fc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -571,7 +571,6 @@ config PARAVIRT_DEBUG config NO_BOOTMEM default y bool "Disable Bootmem code" - depends on X86_64 ---help--- Use early_res directly instead of bootmem before slab is ready. - allocator (buddy) [generic] diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index dbf08bd0125..656cdf86a2f 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -354,6 +354,9 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) /* need to go over early_node_map to find out good range for node */ nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif subtract_early_res(range, count); nr_range = clean_sort_range(range, count); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2dccde06d22..262867a7d43 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -748,6 +748,7 @@ static void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +#ifndef CONFIG_NO_BOOTMEM static unsigned long __init setup_node_bootmem(int nodeid, unsigned long start_pfn, unsigned long end_pfn, @@ -767,9 +768,11 @@ static unsigned long __init setup_node_bootmem(int nodeid, return bootmap + bootmap_size; } +#endif void __init setup_bootmem_allocator(void) { +#ifndef CONFIG_NO_BOOTMEM int nodeid; unsigned long bootmap_size, bootmap; /* @@ -781,11 +784,13 @@ void __init setup_bootmem_allocator(void) if (bootmap == -1L) panic("Cannot find bootmem map of size %ld\n", bootmap_size); reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); +#endif printk(KERN_INFO " mapped low ram: 0 - %08lx\n", max_pfn_mapped<node_id = nid; +#ifndef CONFIG_NO_BOOTMEM NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; +#endif } setup_bootmem_allocator(); -- cgit v1.2.3-70-g09d2 From dd645cee7b50b61cb2d05b59eb6027679c437af6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:30 -0800 Subject: x86: Add find_fw_memmap_area ... so we can move early_res up. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-27-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/early_res.h | 1 + arch/x86/kernel/e820.c | 4 ++++ arch/x86/kernel/early_res.c | 17 +++++++++++------ 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h index 5a4d2eb8e79..9758f3df9da 100644 --- a/arch/x86/include/asm/early_res.h +++ b/arch/x86/include/asm/early_res.h @@ -12,6 +12,7 @@ u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, u64 size, u64 align); u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, u64 *sizep, u64 align); +u64 find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align); #include int get_free_all_memory_range(struct range **rangep, int nodeid); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b4e512b03aa..36918d8463a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -748,6 +748,10 @@ u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align) return -1ULL; } +u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) +{ + return find_e820_area(start, end, size, align); +} /* * Find next free range after *start */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c index 656cdf86a2f..1458dc02234 100644 --- a/arch/x86/kernel/early_res.c +++ b/arch/x86/kernel/early_res.c @@ -7,16 +7,14 @@ #include #include -#include #include -#include /* * Early reserved memory areas. */ /* * need to make sure this one is bigger enough before - * find_e820_area could be used + * find_fw_memmap_area could be used */ #define MAX_EARLY_RES_X 32 @@ -180,6 +178,13 @@ void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) __reserve_early(start, end, name, 1); } +u64 __init __weak find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) +{ + panic("should have find_fw_memmap_area defined with arch"); + + return -1ULL; +} + static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) { u64 start, end, size, mem; @@ -198,13 +203,13 @@ static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) start = early_res[0].end; end = ex_start; if (start + size < end) - mem = find_e820_area(start, end, size, + mem = find_fw_memmap_area(start, end, size, sizeof(struct early_res)); if (mem == -1ULL) { start = ex_end; end = max_pfn_mapped << PAGE_SHIFT; if (start + size < end) - mem = find_e820_area(start, end, size, + mem = find_fw_memmap_area(start, end, size, sizeof(struct early_res)); } if (mem == -1ULL) @@ -343,7 +348,7 @@ int __init get_free_all_memory_range(struct range **rangep, int nodeid) start = MAX_DMA32_PFN << PAGE_SHIFT; #endif end = max_pfn_mapped << PAGE_SHIFT; - mem = find_e820_area(start, end, size, sizeof(struct range)); + mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); if (mem == -1ULL) panic("can not find more space for range free"); -- cgit v1.2.3-70-g09d2 From 414bb144efa2d2fe16d104d836d0d6b6e9265788 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Dec 2009 13:08:41 +0100 Subject: x86, cpu: Print AMD virtualization features in /proc/cpuinfo This patch adds code to cpu initialization path to detect the extended virtualization features of AMD cpus to show them in /proc/cpuinfo. Signed-off-by: Joerg Roedel LKML-Reference: <1260792521-15212-1-git-send-email-joerg.roedel@amd.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/cpufeature.h | 4 ++++ arch/x86/kernel/cpu/addon_cpuid_features.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 637e1ec963c..0cd82d06861 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -168,6 +168,10 @@ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 468489b57aa..97ad79cdf68 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, { 0, 0, 0, 0 } }; -- cgit v1.2.3-70-g09d2 From 942fa3b63eb525aa0512ba28c42e656d8efc6787 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 8 Feb 2010 10:03:17 +0000 Subject: x86, mtrr: Kill over the top warn Fixes bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=12558 Fixes bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=12317 (and if this really needed to be a warn you'd be responding to the bugs left in bugzilla from it...) Signed-off-by: Alan Cox LKML-Reference: <20100208100239.2568.2940.stgit@localhost.localdomain> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 4d755846fee..163e59e272d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -464,7 +464,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); mask_lo = tmp; } } -- cgit v1.2.3-70-g09d2 From dade7716925a4e9a31f249f9ca1ed4e2f1495a8c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 18:39:36 +0200 Subject: x86: Convert ioapic_lock and vector_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/io_apic.c | 106 ++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 937150e4c06..d55e43d352b 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -73,8 +73,8 @@ */ int sis_apic_bug = -1; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -393,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) struct irq_pin_list *entry; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { unsigned int reg; int pin; @@ -402,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg) reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return true; } } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return false; } @@ -420,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) { union entry_union eu; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -446,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -461,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin) unsigned long flags; union entry_union eu = { .entry.mask = 1 }; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } /* @@ -591,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc) BUG_ON(!cfg); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __mask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) @@ -601,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) struct irq_cfg *cfg = desc->chip_data; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void mask_IO_APIC_irq(unsigned int irq) @@ -1127,12 +1127,12 @@ void lock_vector_lock(void) /* Used to the online set of cpus does not change * during assign_irq_vector. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); } void unlock_vector_lock(void) { - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static int @@ -1220,9 +1220,9 @@ int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) int err; unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, cfg, mask); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return err; } @@ -1265,7 +1265,7 @@ void __setup_vector_irq(int cpu) * assignments that might be happening on another cpu in parallel, * while we setup our initial vector to irq mappings. */ - spin_lock(&vector_lock); + raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; @@ -1284,7 +1284,7 @@ void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } - spin_unlock(&vector_lock); + raw_spin_unlock(&vector_lock); } static struct irq_chip ioapic_chip; @@ -1603,14 +1603,14 @@ __apicdebuginit(void) print_IO_APIC(void) for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic, 0); reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); if (reg_01.bits.version >= 0x20) reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); @@ -1905,9 +1905,9 @@ void __init enable_IO_APIC(void) * The number of IO-APIC IRQ registers (== #pins): */ for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } @@ -2047,9 +2047,9 @@ void __init setup_ioapic_ids_from_mpc(void) for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); old_id = mp_ioapics[apic_id].apicid; @@ -2108,16 +2108,16 @@ void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic_id].apicid); reg_00.bits.ID = mp_ioapics[apic_id].apicid; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check */ - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) printk("could not set ID!\n"); else @@ -2200,7 +2200,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) unsigned long flags; struct irq_cfg *cfg; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); if (irq < nr_legacy_irqs) { disable_8259A_irq(irq); if (i8259A_irq_pending(irq)) @@ -2208,7 +2208,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) } cfg = irq_cfg(irq); __unmask_IO_APIC_irq(cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return was_pending; } @@ -2219,9 +2219,9 @@ static int ioapic_retrigger_irq(unsigned int irq) struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); return 1; } @@ -2314,14 +2314,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); ret = set_desc_affinity(desc, mask, &dest); if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return ret; } @@ -2549,9 +2549,9 @@ static void eoi_ioapic_irq(struct irq_desc *desc) irq = desc->irq; cfg = desc->chip_data; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); } static void ack_apic_level(unsigned int irq) @@ -3133,13 +3133,13 @@ static int ioapic_resume(struct sys_device *dev) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(dev->id, 0); if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { reg_00.bits.ID = mp_ioapics[dev->id].apicid; io_apic_write(dev->id, 0, reg_00.raw); } - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); for (i = 0; i < nr_ioapic_registers[dev->id]; i++) ioapic_write_entry(dev->id, i, entry[i]); @@ -3202,7 +3202,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) if (irq_want < nr_irqs_gsi) irq_want = nr_irqs_gsi; - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new < nr_irqs; new++) { desc_new = irq_to_desc_alloc_node(new, node); if (!desc_new) { @@ -3221,7 +3221,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) irq = new; break; } - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { dynamic_irq_init(irq); @@ -3261,9 +3261,9 @@ void destroy_irq(unsigned int irq) desc->chip_data = cfg; free_irte(irq); - spin_lock_irqsave(&vector_lock, flags); + raw_spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq, cfg); - spin_unlock_irqrestore(&vector_lock, flags); + raw_spin_unlock_irqrestore(&vector_lock, flags); } /* @@ -3800,9 +3800,9 @@ int __init io_apic_get_redir_entries (int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.entries; } @@ -3964,9 +3964,9 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (physids_empty(apic_id_map)) apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map); - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); if (apic_id >= get_physical_broadcast()) { printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " @@ -4000,10 +4000,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) if (reg_00.bits.ID != apic_id) { reg_00.bits.ID = apic_id; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0, reg_00.raw); reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); /* Sanity check */ if (reg_00.bits.ID != apic_id) { @@ -4024,9 +4024,9 @@ int __init io_apic_get_version(int ioapic) union IO_APIC_reg_01 reg_01; unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + raw_spin_lock_irqsave(&ioapic_lock, flags); reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); + raw_spin_unlock_irqrestore(&ioapic_lock, flags); return reg_01.bits.version; } -- cgit v1.2.3-70-g09d2 From 40d6753e78a602bdf62e7741c0caa36474882f00 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 18:33:11 +0200 Subject: x86: Convert set_atomicity_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/generic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 163e59e272d..9aa5dc76ff4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -570,7 +570,7 @@ static unsigned long set_mtrr_state(void) static unsigned long cr4; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts, @@ -590,7 +590,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) * changes to the way the kernel boots */ - spin_lock(&set_atomicity_lock); + raw_spin_lock(&set_atomicity_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; @@ -627,7 +627,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) write_cr4(cr4); - spin_unlock(&set_atomicity_lock); + raw_spin_unlock(&set_atomicity_lock); } static void generic_set_all(void) -- cgit v1.2.3-70-g09d2 From 0fdc7a8022c3eaff6b5ee27ffb9e913e5e58d8e9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 16:49:55 +0200 Subject: x86: Convert nmi_lock to raw_spinlock nmi_lock must be a spinning spinlock in -rt. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/apic/nmi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396c..24e7742d633 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -416,13 +416,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) /* We can be called before check_nmi_watchdog, hence NULL check. */ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */ - spin_lock(&lock); + raw_spin_lock(&lock); printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); show_regs(regs); dump_stack(); - spin_unlock(&lock); + raw_spin_unlock(&lock); cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); rc = 1; -- cgit v1.2.3-70-g09d2 From 5619c28061ff9d2559a93eaba492935530f2a513 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 25 Jul 2009 18:35:11 +0200 Subject: x86: Convert i8259_lock to raw_spinlock Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/i8259.h | 2 +- arch/x86/kernel/apic/io_apic.c | 4 ++-- arch/x86/kernel/i8259.c | 30 +++++++++++++++--------------- arch/x86/kernel/time.c | 4 ++-- arch/x86/kernel/visws_quirks.c | 6 +++--- 5 files changed, 23 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091eeb1..7ec65b18085 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -24,7 +24,7 @@ extern unsigned int cached_irq_mask; #define SLAVE_ICW4_DEFAULT 0x01 #define PIC_ICW4_AEOI 2 -extern spinlock_t i8259A_lock; +extern raw_spinlock_t i8259A_lock; extern void init_8259A(int auto_eoi); extern void enable_8259A_irq(unsigned int irq); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index c86591b906f..f5e40339622 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1830,7 +1830,7 @@ __apicdebuginit(void) print_PIC(void) printk(KERN_DEBUG "\nprinting PIC contents\n"); - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); v = inb(0xa1) << 8 | inb(0x21); printk(KERN_DEBUG "... PIC IMR: %04x\n", v); @@ -1844,7 +1844,7 @@ __apicdebuginit(void) print_PIC(void) outb(0x0a,0xa0); outb(0x0a,0x20); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); printk(KERN_DEBUG "... PIC ISR: %04x\n", v); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef8..8c93a84bb62 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -32,7 +32,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); struct irq_chip i8259A_chip = { @@ -68,13 +68,13 @@ void disable_8259A_irq(unsigned int irq) unsigned int mask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void enable_8259A_irq(unsigned int irq) @@ -82,13 +82,13 @@ void enable_8259A_irq(unsigned int irq) unsigned int mask = ~(1 << irq); unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } int i8259A_irq_pending(unsigned int irq) @@ -97,12 +97,12 @@ int i8259A_irq_pending(unsigned int irq) unsigned long flags; int ret; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } @@ -150,7 +150,7 @@ static void mask_and_ack_8259A(unsigned int irq) unsigned int irqmask = 1 << irq; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign @@ -183,7 +183,7 @@ handle_real_irq: outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: @@ -285,24 +285,24 @@ void mask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void unmask_8259A(void) { unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } void init_8259A(int auto_eoi) @@ -311,7 +311,7 @@ void init_8259A(int auto_eoi) i8259A_auto_eoi = auto_eoi; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ @@ -356,5 +356,5 @@ void init_8259A(int auto_eoi) outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); } diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index be2573448ed..fb5cc5e14cf 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) * manually to deassert NMI lines for the watchdog if run * on an 82489DX-based system. */ - spin_lock(&i8259A_lock); + raw_spin_lock(&i8259A_lock); outb(0x0c, PIC_MASTER_OCW3); /* Ack the IRQ; AEOI will end it automatically. */ inb(PIC_MASTER_POLL); - spin_unlock(&i8259A_lock); + raw_spin_unlock(&i8259A_lock); } global_clock_event->event_handler(global_clock_event); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471..ab38ce0984f 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -559,7 +559,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) struct irq_desc *desc; unsigned long flags; - spin_lock_irqsave(&i8259A_lock, flags); + raw_spin_lock_irqsave(&i8259A_lock, flags); /* Find out what's interrupting in the PIIX4 master 8259 */ outb(0x0c, 0x20); /* OCW3 Poll command */ @@ -596,7 +596,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) outb(0x60 + realirq, 0x20); } - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); desc = irq_to_desc(realirq); @@ -614,7 +614,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) return IRQ_HANDLED; out_unlock: - spin_unlock_irqrestore(&i8259A_lock, flags); + raw_spin_unlock_irqrestore(&i8259A_lock, flags); return IRQ_NONE; } -- cgit v1.2.3-70-g09d2 From 580e0ad21d6d6f932461d24b47041e3dd499c23f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Feb 2010 18:40:35 -0800 Subject: core: Move early_res from arch/x86 to kernel/ This makes the range reservation feature available to other architectures. -v2: add get_max_mapped, max_pfn_mapped only defined in x86... to fix PPC compiling -v3: according to hpa, add CONFIG_HAVE_EARLY_RES -v4: fix typo about EARLY_RES in config Signed-off-by: Yinghai Lu LKML-Reference: <4B7B5723.4070009@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 3 + arch/x86/include/asm/e820.h | 2 +- arch/x86/include/asm/early_res.h | 21 -- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/e820.c | 10 +- arch/x86/kernel/early_res.c | 521 --------------------------------------- include/linux/early_res.h | 22 ++ kernel/Makefile | 1 + kernel/early_res.c | 513 ++++++++++++++++++++++++++++++++++++++ 9 files changed, 550 insertions(+), 545 deletions(-) delete mode 100644 arch/x86/include/asm/early_res.h delete mode 100644 arch/x86/kernel/early_res.c create mode 100644 include/linux/early_res.h create mode 100644 kernel/early_res.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29f9efb74fc..0e9f8b10de5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -183,6 +183,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config HAVE_EARLY_RES + def_bool y + config HAVE_INTEL_TXT def_bool y depends on EXPERIMENTAL && DMAR && ACPI diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index a8299e13443..0e22296790d 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -112,7 +112,7 @@ extern unsigned long end_user_pfn; extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); -#include +#include extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); diff --git a/arch/x86/include/asm/early_res.h b/arch/x86/include/asm/early_res.h deleted file mode 100644 index 9758f3df9da..00000000000 --- a/arch/x86/include/asm/early_res.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _ASM_X86_EARLY_RES_H -#define _ASM_X86_EARLY_RES_H -#ifdef __KERNEL__ - -extern void reserve_early(u64 start, u64 end, char *name); -extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); -extern void free_early(u64 start, u64 end); -extern void early_res_to_bootmem(u64 start, u64 end); - -void reserve_early_without_check(u64 start, u64 end, char *name); -u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align); -u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align); -u64 find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align); -#include -int get_free_all_memory_range(struct range **rangep, int nodeid); - -#endif /* __KERNEL__ */ - -#endif /* _ASM_X86_EARLY_RES_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f5fb9f0b627..d87f09bc5a5 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_X86_32) += probe_roms_32.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o -obj-y += bootflag.o e820.o early_res.o +obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o obj-y += tsc.o io_delay.o rtc.o diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 36918d8463a..740b440fbd7 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -17,7 +17,6 @@ #include #include -#include #include #include @@ -752,6 +751,15 @@ u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) { return find_e820_area(start, end, size, align); } + +u64 __init get_max_mapped(void) +{ + u64 end = max_pfn_mapped; + + end <<= PAGE_SHIFT; + + return end; +} /* * Find next free range after *start */ diff --git a/arch/x86/kernel/early_res.c b/arch/x86/kernel/early_res.c deleted file mode 100644 index 1458dc02234..00000000000 --- a/arch/x86/kernel/early_res.c +++ /dev/null @@ -1,521 +0,0 @@ -/* - * early_res, could be used to replace bootmem - */ -#include -#include -#include -#include -#include - -#include - -/* - * Early reserved memory areas. - */ -/* - * need to make sure this one is bigger enough before - * find_fw_memmap_area could be used - */ -#define MAX_EARLY_RES_X 32 - -struct early_res { - u64 start, end; - char name[15]; - char overlap_ok; -}; -static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; - -static int max_early_res __initdata = MAX_EARLY_RES_X; -static struct early_res *early_res __initdata = &early_res_x[0]; -static int early_res_count __initdata; - -static int __init find_overlapped_early(u64 start, u64 end) -{ - int i; - struct early_res *r; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - if (end > r->start && start < r->end) - break; - } - - return i; -} - -/* - * Drop the i-th range from the early reservation map, - * by copying any higher ranges down one over it, and - * clearing what had been the last slot. - */ -static void __init drop_range(int i) -{ - int j; - - for (j = i + 1; j < max_early_res && early_res[j].end; j++) - ; - - memmove(&early_res[i], &early_res[i + 1], - (j - 1 - i) * sizeof(struct early_res)); - - early_res[j - 1].end = 0; - early_res_count--; -} - -/* - * Split any existing ranges that: - * 1) are marked 'overlap_ok', and - * 2) overlap with the stated range [start, end) - * into whatever portion (if any) of the existing range is entirely - * below or entirely above the stated range. Drop the portion - * of the existing range that overlaps with the stated range, - * which will allow the caller of this routine to then add that - * stated range without conflicting with any existing range. - */ -static void __init drop_overlaps_that_are_ok(u64 start, u64 end) -{ - int i; - struct early_res *r; - u64 lower_start, lower_end; - u64 upper_start, upper_end; - char name[15]; - - for (i = 0; i < max_early_res && early_res[i].end; i++) { - r = &early_res[i]; - - /* Continue past non-overlapping ranges */ - if (end <= r->start || start >= r->end) - continue; - - /* - * Leave non-ok overlaps as is; let caller - * panic "Overlapping early reservations" - * when it hits this overlap. - */ - if (!r->overlap_ok) - return; - - /* - * We have an ok overlap. We will drop it from the early - * reservation map, and add back in any non-overlapping - * portions (lower or upper) as separate, overlap_ok, - * non-overlapping ranges. - */ - - /* 1. Note any non-overlapping (lower or upper) ranges. */ - strncpy(name, r->name, sizeof(name) - 1); - - lower_start = lower_end = 0; - upper_start = upper_end = 0; - if (r->start < start) { - lower_start = r->start; - lower_end = start; - } - if (r->end > end) { - upper_start = end; - upper_end = r->end; - } - - /* 2. Drop the original ok overlapping range */ - drop_range(i); - - i--; /* resume for-loop on copied down entry */ - - /* 3. Add back in any non-overlapping ranges. */ - if (lower_end) - reserve_early_overlap_ok(lower_start, lower_end, name); - if (upper_end) - reserve_early_overlap_ok(upper_start, upper_end, name); - } -} - -static void __init __reserve_early(u64 start, u64 end, char *name, - int overlap_ok) -{ - int i; - struct early_res *r; - - i = find_overlapped_early(start, end); - if (i >= max_early_res) - panic("Too many early reservations"); - r = &early_res[i]; - if (r->end) - panic("Overlapping early reservations " - "%llx-%llx %s to %llx-%llx %s\n", - start, end - 1, name ? name : "", r->start, - r->end - 1, r->name); - r->start = start; - r->end = end; - r->overlap_ok = overlap_ok; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -/* - * A few early reservtations come here. - * - * The 'overlap_ok' in the name of this routine does -not- mean it - * is ok for these reservations to overlap an earlier reservation. - * Rather it means that it is ok for subsequent reservations to - * overlap this one. - * - * Use this entry point to reserve early ranges when you are doing - * so out of "Paranoia", reserving perhaps more memory than you need, - * just in case, and don't mind a subsequent overlapping reservation - * that is known to be needed. - * - * The drop_overlaps_that_are_ok() call here isn't really needed. - * It would be needed if we had two colliding 'overlap_ok' - * reservations, so that the second such would not panic on the - * overlap with the first. We don't have any such as of this - * writing, but might as well tolerate such if it happens in - * the future. - */ -void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) -{ - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 1); -} - -u64 __init __weak find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align) -{ - panic("should have find_fw_memmap_area defined with arch"); - - return -1ULL; -} - -static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) -{ - u64 start, end, size, mem; - struct early_res *new; - - /* do we have enough slots left ? */ - if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) - return; - - /* double it */ - mem = -1ULL; - size = sizeof(struct early_res) * max_early_res * 2; - if (early_res == early_res_x) - start = 0; - else - start = early_res[0].end; - end = ex_start; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - if (mem == -1ULL) { - start = ex_end; - end = max_pfn_mapped << PAGE_SHIFT; - if (start + size < end) - mem = find_fw_memmap_area(start, end, size, - sizeof(struct early_res)); - } - if (mem == -1ULL) - panic("can not find more space for early_res array"); - - new = __va(mem); - /* save the first one for own */ - new[0].start = mem; - new[0].end = mem + size; - new[0].overlap_ok = 0; - /* copy old to new */ - if (early_res == early_res_x) { - memcpy(&new[1], &early_res[0], - sizeof(struct early_res) * max_early_res); - memset(&new[max_early_res+1], 0, - sizeof(struct early_res) * (max_early_res - 1)); - early_res_count++; - } else { - memcpy(&new[1], &early_res[1], - sizeof(struct early_res) * (max_early_res - 1)); - memset(&new[max_early_res], 0, - sizeof(struct early_res) * max_early_res); - } - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = new; - max_early_res *= 2; - printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", - max_early_res, mem, mem + size - 1); -} - -/* - * Most early reservations come here. - * - * We first have drop_overlaps_that_are_ok() drop any pre-existing - * 'overlap_ok' ranges, so that we can then reserve this memory - * range without risk of panic'ing on an overlapping overlap_ok - * early reservation. - */ -void __init reserve_early(u64 start, u64 end, char *name) -{ - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - drop_overlaps_that_are_ok(start, end); - __reserve_early(start, end, name, 0); -} - -void __init reserve_early_without_check(u64 start, u64 end, char *name) -{ - struct early_res *r; - - if (start >= end) - return; - - __check_and_double_early_res(start, end); - - r = &early_res[early_res_count]; - - r->start = start; - r->end = end; - r->overlap_ok = 0; - if (name) - strncpy(r->name, name, sizeof(r->name) - 1); - early_res_count++; -} - -void __init free_early(u64 start, u64 end) -{ - struct early_res *r; - int i; - - i = find_overlapped_early(start, end); - r = &early_res[i]; - if (i >= max_early_res || r->end != end || r->start != start) - panic("free_early on not reserved area: %llx-%llx!", - start, end - 1); - - drop_range(i); -} - -#ifdef CONFIG_NO_BOOTMEM -static void __init subtract_early_res(struct range *range, int az) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - -#define DEBUG_PRINT_EARLY_RES 1 - -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO "Subtract (%d early reservations)\n", count); -#endif - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; -#if DEBUG_PRINT_EARLY_RES - printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, - r->start, r->end, r->name); -#endif - final_start = PFN_DOWN(r->start); - final_end = PFN_UP(r->end); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - int i, count; - u64 start = 0, end; - u64 size; - u64 mem; - struct range *range; - int nr_range; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - count *= 2; - - size = sizeof(struct range) * count; -#ifdef MAX_DMA32_PFN - if (max_pfn_mapped > MAX_DMA32_PFN) - start = MAX_DMA32_PFN << PAGE_SHIFT; -#endif - end = max_pfn_mapped << PAGE_SHIFT; - mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); - if (mem == -1ULL) - panic("can not find more space for range free"); - - range = __va(mem); - /* use early_node_map[] and early_res to get range array at first */ - memset(range, 0, size); - nr_range = 0; - - /* need to go over early_node_map to find out good range for node */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); -#ifdef CONFIG_X86_32 - subtract_range(range, count, max_low_pfn, -1ULL); -#endif - subtract_early_res(range, count); - nr_range = clean_sort_range(range, count); - - /* need to clear it ? */ - if (nodeid == MAX_NUMNODES) { - memset(&early_res[0], 0, - sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - } - - *rangep = range; - return nr_range; -} -#else -void __init early_res_to_bootmem(u64 start, u64 end) -{ - int i, count; - u64 final_start, final_end; - int idx = 0; - - count = 0; - for (i = 0; i < max_early_res && early_res[i].end; i++) - count++; - - /* need to skip first one ?*/ - if (early_res != early_res_x) - idx = 1; - - printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", - count - idx, max_early_res, start, end); - for (i = idx; i < count; i++) { - struct early_res *r = &early_res[i]; - printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, - r->start, r->end, r->name); - final_start = max(start, r->start); - final_end = min(end, r->end); - if (final_start >= final_end) { - printk(KERN_CONT "\n"); - continue; - } - printk(KERN_CONT " ==> [%010llx - %010llx]\n", - final_start, final_end); - reserve_bootmem_generic(final_start, final_end - final_start, - BOOTMEM_DEFAULT); - } - /* clear them */ - memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); - early_res = NULL; - max_early_res = 0; - early_res_count = 0; -} -#endif - -/* Check for already reserved areas */ -static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) -{ - int i; - u64 addr = *addrp; - int changed = 0; - struct early_res *r; -again: - i = find_overlapped_early(addr, addr + size); - r = &early_res[i]; - if (i < max_early_res && r->end) { - *addrp = addr = round_up(r->end, align); - changed = 1; - goto again; - } - return changed; -} - -/* Check for already reserved areas */ -static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) -{ - int i; - u64 addr = *addrp, last; - u64 size = *sizep; - int changed = 0; -again: - last = addr + size; - for (i = 0; i < max_early_res && early_res[i].end; i++) { - struct early_res *r = &early_res[i]; - if (last > r->start && addr < r->start) { - size = r->start - addr; - changed = 1; - goto again; - } - if (last > r->end && addr < r->end) { - addr = round_up(r->end, align); - size = last - addr; - changed = 1; - goto again; - } - if (last <= r->end && addr >= r->start) { - (*sizep)++; - return 0; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find a free area with specified alignment in a specific range. - * only with the area.between start to end is active range from early_node_map - * so they are good as RAM - */ -u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, - u64 size, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - while (bad_addr(&addr, size, align) && addr+size <= ei_last) - ; - last = addr + size; - if (last > ei_last) - goto out; - if (last > end) - goto out; - - return addr; - -out: - return -1ULL; -} - -u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, - u64 *sizep, u64 align) -{ - u64 addr, last; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - goto out; - *sizep = ei_last - addr; - while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) - ; - last = addr + *sizep; - if (last > ei_last) - goto out; - - return addr; - -out: - return -1ULL; -} diff --git a/include/linux/early_res.h b/include/linux/early_res.h new file mode 100644 index 00000000000..50f7663bb8b --- /dev/null +++ b/include/linux/early_res.h @@ -0,0 +1,22 @@ +#ifndef _LINUX_EARLY_RES_H +#define _LINUX_EARLY_RES_H +#ifdef __KERNEL__ + +extern void reserve_early(u64 start, u64 end, char *name); +extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); +extern void free_early(u64 start, u64 end); +extern void early_res_to_bootmem(u64 start, u64 end); + +void reserve_early_without_check(u64 start, u64 end, char *name); +u64 find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align); +u64 find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align); +u64 find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align); +u64 get_max_mapped(void); +#include +int get_free_all_memory_range(struct range **rangep, int nodeid); + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_EARLY_RES_H */ diff --git a/kernel/Makefile b/kernel/Makefile index ad47330ccf3..1292b863d66 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ async.o range.o +obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 00000000000..aa5494ac446 --- /dev/null +++ b/kernel/early_res.c @@ -0,0 +1,513 @@ +/* + * early_res, could be used to replace bootmem + */ +#include +#include +#include +#include +#include +#include + +/* + * Early reserved memory areas. + */ +/* + * need to make sure this one is bigger enough before + * find_fw_memmap_area could be used + */ +#define MAX_EARLY_RES_X 32 + +struct early_res { + u64 start, end; + char name[15]; + char overlap_ok; +}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; + +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < max_early_res && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; + early_res_count--; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. + */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[15]; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); + if (i >= max_early_res) + panic("Too many early reservations"); + r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name ? name : "", r->start, + r->end - 1, r->name); + r->start = start; + r->end = end; + r->overlap_ok = overlap_ok; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) +{ + u64 start, end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + mem = -1ULL; + size = sizeof(struct early_res) * max_early_res * 2; + if (early_res == early_res_x) + start = 0; + else + start = early_res[0].end; + end = ex_start; + if (start + size < end) + mem = find_fw_memmap_area(start, end, size, + sizeof(struct early_res)); + if (mem == -1ULL) { + start = ex_end; + end = get_max_mapped(); + if (start + size < end) + mem = find_fw_memmap_area(start, end, size, + sizeof(struct early_res)); + } + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. + */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + __check_and_double_early_res(start, end); + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(start, end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= max_early_res || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#define DEBUG_PRINT_EARLY_RES 1 + +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; + end = get_max_mapped(); +#ifdef MAX_DMA32_PFN + if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; +} +#endif + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < max_early_res && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < max_early_res && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + +u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return -1ULL; +} -- cgit v1.2.3-70-g09d2 From 0a832320f1bae6a4169bf683e201378f2437cfc1 Mon Sep 17 00:00:00 2001 From: "Justin P. Mattock" Date: Tue, 16 Feb 2010 15:17:29 -0800 Subject: x86: Add iMac9,1 to pci_reboot_dmi_table On the iMac9,1 /sbin/reboot results in a black mangled screen. Adding this DMI entry gets the machine to reboot cleanly as it should. Signed-off-by: Justin P. Mattock LKML-Reference: <1266362249-3337-1-git-send-email-justinmattock@gmail.com> Signed-off-by: H. Peter Anvin Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 704bddcdf64..8e1aac86b50 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -461,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, { } }; -- cgit v1.2.3-70-g09d2 From e7b8e675d9c71b868b66f62f725a948047514719 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 26 Jan 2010 04:40:03 -0500 Subject: tracing: Unify arch_syscall_addr() implementations Most implementations of arch_syscall_addr() are the same, so create a default version in common code and move the one piece that differs (the syscall table) to asm/syscall.h. New arch ports don't have to waste time copying & pasting this simple function. The s390/sparc versions need to be different, so document why. Signed-off-by: Mike Frysinger Acked-by: David S. Miller Acked-by: Paul Mundt Acked-by: Heiko Carstens Cc: Steven Rostedt LKML-Reference: <1264498803-17278-1-git-send-email-vapier@gentoo.org> Signed-off-by: Frederic Weisbecker --- Documentation/trace/ftrace-design.txt | 5 ++--- arch/s390/include/asm/syscall.h | 7 +++++++ arch/s390/kernel/ftrace.c | 10 ---------- arch/sh/include/asm/syscall.h | 2 ++ arch/sh/kernel/ftrace.c | 9 --------- arch/sparc/include/asm/syscall.h | 7 +++++++ arch/sparc/kernel/ftrace.c | 11 ----------- arch/x86/include/asm/syscall.h | 2 ++ arch/x86/kernel/ftrace.c | 10 ---------- include/linux/ftrace.h | 6 ++++++ kernel/trace/trace_syscalls.c | 5 +++++ 11 files changed, 31 insertions(+), 43 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index 239f14b2b55..99df1101d2a 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -218,11 +218,10 @@ HAVE_SYSCALL_TRACEPOINTS You need very few things to get the syscalls tracing in an arch. +- Support HAVE_ARCH_TRACEHOOK (see arch/Kconfig). - Have a NR_syscalls variable in that provides the number of syscalls supported by the arch. -- Implement arch_syscall_addr() that resolves a syscall address from a - syscall number. -- Support the TIF_SYSCALL_TRACEPOINT thread flags +- Support the TIF_SYSCALL_TRACEPOINT thread flags. - Put the trace_sys_enter() and trace_sys_exit() tracepoints calls from ptrace in the ptrace syscalls tracing path. - Tag this arch as HAVE_SYSCALL_TRACEPOINTS. diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index e0a73d3eb83..8429686951f 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -15,6 +15,13 @@ #include #include +/* + * The syscall table always contains 32 bit pointers since we know that the + * address of the function to be called is (way) below 4GB. So the "int" + * type here is what we want [need] for both 32 bit and 64 bit systems. + */ +extern const unsigned int sys_call_table[]; + static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 5a82bc68193..9e69449e77a 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -200,13 +200,3 @@ out: return parent; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned int sys_call_table[]; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} -#endif diff --git a/arch/sh/include/asm/syscall.h b/arch/sh/include/asm/syscall.h index 6a381429ee9..aa7777bdc37 100644 --- a/arch/sh/include/asm/syscall.h +++ b/arch/sh/include/asm/syscall.h @@ -1,6 +1,8 @@ #ifndef __ASM_SH_SYSCALL_H #define __ASM_SH_SYSCALL_H +extern const unsigned long sys_call_table[]; + #ifdef CONFIG_SUPERH32 # include "syscall_32.h" #else diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index a48cdedc73b..30e13196d35 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -399,12 +399,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} -#endif /* CONFIG_FTRACE_SYSCALLS */ diff --git a/arch/sparc/include/asm/syscall.h b/arch/sparc/include/asm/syscall.h index 7486c605e23..025a02ad2e3 100644 --- a/arch/sparc/include/asm/syscall.h +++ b/arch/sparc/include/asm/syscall.h @@ -5,6 +5,13 @@ #include #include +/* + * The syscall table always contains 32 bit pointers since we know that the + * address of the function to be called is (way) below 4GB. So the "int" + * type here is what we want [need] for both 32 bit and 64 bit systems. + */ +extern const unsigned int sys_call_table[]; + /* The system call number is given by the user in %g1 */ static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 29973daa993..9103a56b39e 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -91,14 +91,3 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } #endif - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned int sys_call_table[]; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)sys_call_table[nr]; -} - -#endif diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 8d33bc5462d..c4a348f7bd4 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -16,6 +16,8 @@ #include #include +extern const unsigned long sys_call_table[]; + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 30968924543..0d93a941934 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -484,13 +484,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - -#ifdef CONFIG_FTRACE_SYSCALLS - -extern unsigned long *sys_call_table; - -unsigned long __init arch_syscall_addr(int nr) -{ - return (unsigned long)(&sys_call_table)[nr]; -} -#endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0b4f97d24d7..1cbb36f2759 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -511,4 +511,10 @@ static inline void trace_hw_branch_oops(void) {} #endif /* CONFIG_HW_BRANCH_TRACER */ +#ifdef CONFIG_FTRACE_SYSCALLS + +unsigned long arch_syscall_addr(int nr); + +#endif /* CONFIG_FTRACE_SYSCALLS */ + #endif /* _LINUX_FTRACE_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 49cea70fbf6..ecf00782b46 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -394,6 +394,11 @@ int init_syscall_trace(struct ftrace_event_call *call) return id; } +unsigned long __init arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; +} + int __init init_ftrace_syscalls(void) { struct syscall_metadata *meta; -- cgit v1.2.3-70-g09d2 From 6738762d73a237ec322b04d8b9d55c8fd5d84713 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:36 -0800 Subject: x86, irq: Remove arch_probe_nr_irqs So keep nr_irqs == NR_IRQS. With radix trees is matters less. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-33-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index f5e40339622..c64ddd9d997 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3826,28 +3826,6 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } -#ifdef CONFIG_SPARSE_IRQ -int __init arch_probe_nr_irqs(void) -{ - int nr; - - if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) - nr_irqs = NR_VECTORS * nr_cpu_ids; - - nr = nr_irqs_gsi + 8 * nr_cpu_ids; -#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) - /* - * for MSI and HT dyn irq - */ - nr += nr_irqs_gsi * 16; -#endif - if (nr < nr_irqs) - nr_irqs = nr; - - return 0; -} -#endif - static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { -- cgit v1.2.3-70-g09d2 From 2b633e3fac5efada088b57d31e65401f22bcc18f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:37 -0800 Subject: smp: Use nr_cpus= to set nr_cpu_ids early On x86, before prefill_possible_map(), nr_cpu_ids will be NR_CPUS aka CONFIG_NR_CPUS. Add nr_cpus= to set nr_cpu_ids. so we can simulate cpus <=8 are installed on normal config. -v2: accordging to Christoph, acpi_numa_init should use nr_cpu_ids in stead of NR_CPUS. -v3: add doc in kernel-parameters.txt according to Andrew. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-34-git-send-email-yinghai@kernel.org> Acked-by: Linus Torvalds Signed-off-by: H. Peter Anvin Cc: Tony Luck --- Documentation/kernel-parameters.txt | 6 ++++++ arch/ia64/kernel/acpi.c | 4 ++-- arch/x86/kernel/smpboot.c | 7 ++++--- drivers/acpi/numa.c | 4 ++-- init/main.c | 14 ++++++++++++++ 5 files changed, 28 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 736d4560288..51bceb0fb27 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1772,6 +1772,12 @@ and is between 256 and 4096 characters. It is defined in the file purges which is reported from either PAL_VM_SUMMARY or SAL PALO. + nr_cpus= [SMP] Maximum number of processors that an SMP kernel + could support. nr_cpus=n : n >= 1 limits the kernel to + supporting 'n' processors. Later in runtime you can not + use hotplug cpu feature to put more cpu back to online. + just like you compile the kernel NR_CPUS=n + nr_uarts= [SERIAL] maximum number of UARTs to be registered. numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 40574ae1140..605a08b2972 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -881,8 +881,8 @@ __init void prefill_possible_map(void) possible = available_cpus + additional_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (possible > nr_cpu_ids) + possible = nr_cpu_ids; printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max((possible - available_cpus), 0)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f..eff2fe17542 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1213,11 +1213,12 @@ __init void prefill_possible_map(void) total_cpus = max_t(int, possible, num_processors + disabled_cpus); - if (possible > CONFIG_NR_CPUS) { + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { printk(KERN_WARNING "%d Processors exceeds NR_CPUS limit of %d\n", - possible, CONFIG_NR_CPUS); - possible = CONFIG_NR_CPUS; + possible, nr_cpu_ids); + possible = nr_cpu_ids; } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 7ad48dfc12d..b8725461d88 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -279,9 +279,9 @@ int __init acpi_numa_init(void) /* SRAT: Static Resource Affinity Table */ if (!acpi_table_parse(ACPI_SIG_SRAT, acpi_parse_srat)) { acpi_table_parse_srat(ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY, - acpi_parse_x2apic_affinity, NR_CPUS); + acpi_parse_x2apic_affinity, nr_cpu_ids); acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, - acpi_parse_processor_affinity, NR_CPUS); + acpi_parse_processor_affinity, nr_cpu_ids); ret = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, NR_NODE_MEMBLKS); diff --git a/init/main.c b/init/main.c index 845187822e5..05b5283e98f 100644 --- a/init/main.c +++ b/init/main.c @@ -149,6 +149,20 @@ static int __init nosmp(char *str) early_param("nosmp", nosmp); +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); -- cgit v1.2.3-70-g09d2 From eb5b3794062824ba12d883901eea49ea89d0a678 Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Sun, 7 Feb 2010 13:02:50 -0800 Subject: x86, irq: Keep chip_data in create_irq_nr and destroy_irq Version 4: use get_irq_chip_data() in destroy_irq() to get rid of some local vars. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips Signed-off-by: Yinghai Lu LKML-Reference: <20100207210250.GB8256@jenkins.home.ifup.org> Signed-off-by: Brandon Phiilps Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 20 ++++------------ include/linux/irq.h | 2 ++ kernel/irq/chip.c | 52 ++++++++++++++++++++++++++++++++++-------- 3 files changed, 50 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 5e4cce254e4..e93a76bc867 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3278,12 +3278,9 @@ unsigned int create_irq_nr(unsigned int irq_want, int node) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3305,19 +3302,12 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; - struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq, cfg); + __clear_irq_vector(irq, get_irq_chip_data(irq)); spin_unlock_irqrestore(&vector_lock, flags); } diff --git a/include/linux/irq.h b/include/linux/irq.h index 451481c082b..4d9b26e044b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -400,7 +400,9 @@ static inline int irq_has_action(unsigned int irq) /* Dynamic irq helper functions */ extern void dynamic_irq_init(unsigned int irq); +void dynamic_irq_init_keep_chip_data(unsigned int irq); extern void dynamic_irq_cleanup(unsigned int irq); +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq); /* Set/get chip/data for an IRQ: */ extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f66..d70394f12ee 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,11 +18,7 @@ #include "internals.h" -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) +static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc; unsigned long flags; @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) desc->depth = 1; desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->action = NULL; desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) } /** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * dynamic_irq_init - initialize a dynamically allocated irq * @irq: irq number to initialize */ -void dynamic_irq_cleanup(unsigned int irq) +void dynamic_irq_init(unsigned int irq) +{ + dynamic_irq_init_x(irq, false); +} + +/** + * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_init_keep_chip_data(unsigned int irq) +{ + dynamic_irq_init_x(irq, true); +} + +static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) } desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; desc->name = NULL; @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, false); +} + +/** + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, true); +} + /** * set_irq_chip - set the irq chip for an irq -- cgit v1.2.3-70-g09d2 From f619b3d8427eb57f0134dab75b0d217325c72411 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Feb 2010 12:09:07 +0100 Subject: x86, cacheinfo: Remove NUMA dependency, fix for AMD Fam10h rev D1 The show/store_cache_disable routines depend unnecessarily on NUMA's cpu_to_node and the disabling of cache indices broke when !CONFIG_NUMA. Remove that dependency by using a helper which is always correct. While at it, enable L3 Cache Index disable on rev D1 Istanbuls which sport the feature too. Signed-off-by: Borislav Petkov LKML-Reference: <20100218184339.GG20473@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 589b705e80e..be5f5c28ddf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -327,7 +327,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) /* see errata #382 and #388 */ if ((boot_cpu_data.x86 == 0x10) && - ((boot_cpu_data.x86_model < 0x9) || + ((boot_cpu_data.x86_model < 0x8) || (boot_cpu_data.x86_mask < 0x1))) return; @@ -744,7 +744,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, unsigned int index) { int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); + int node = amd_get_nb_id(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int reg = 0; @@ -771,7 +771,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, size_t count, unsigned int index) { int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); + int node = amd_get_nb_id(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; -- cgit v1.2.3-70-g09d2 From cb19060abfdecac0d1eb2d2f0e7d6b7a3f8bc4f4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 18 Feb 2010 19:37:14 +0100 Subject: x86, cacheinfo: Enable L3 CID only on AMD Final stage linking can fail with arch/x86/built-in.o: In function `store_cache_disable': intel_cacheinfo.c:(.text+0xc509): undefined reference to `amd_get_nb_id' arch/x86/built-in.o: In function `show_cache_disable': intel_cacheinfo.c:(.text+0xc7d3): undefined reference to `amd_get_nb_id' when CONFIG_CPU_SUP_AMD is not enabled because the amd_get_nb_id helper is defined in AMD-specific code but also used in generic code (intel_cacheinfo.c). Reorganize the L3 cache index disable code under CONFIG_CPU_SUP_AMD since it is AMD-only anyway. Signed-off-by: Borislav Petkov LKML-Reference: <20100218184210.GF20473@aftab> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/intel_cacheinfo.c | 186 ++++++++++++++++++---------------- 1 file changed, 98 insertions(+), 88 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be5f5c28ddf..d440123c556 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -293,6 +293,13 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); +}; + +#ifdef CONFIG_CPU_SUP_AMD static unsigned int __cpuinit amd_calc_l3_indices(void) { /* @@ -303,7 +310,7 @@ static unsigned int __cpuinit amd_calc_l3_indices(void) int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int sc0, sc1, sc2, sc3; - u32 val; + u32 val = 0; pci_read_config_dword(dev, 0x1C4, &val); @@ -335,6 +342,94 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) this_leaf->l3_indices = amd_calc_l3_indices(); } +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!dev) + return -EINVAL; + + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "0x%08x\n", reg); +} + +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) +{ + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = amd_get_nb_id(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; + +#define SUBCACHE_MASK (3UL << 20) +#define SUBCACHE_INDEX 0xfff + + if (!this_leaf->can_disable) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + return -EINVAL; + + val |= BIT(30); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + /* + * We need to WBINVD on a core on the node containing the L3 cache which + * indices we disable therefore a simple wbinvd() is not sufficient. + */ + wbinvd_on_cpu(cpu); + pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + return count; +} + +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +#else /* CONFIG_CPU_SUP_AMD */ +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +{ +}; +#endif /* CONFIG_CPU_SUP_AMD */ + static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) @@ -740,88 +835,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "0x%08x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - -#define SUBCACHE_MASK (3UL << 20) -#define SUBCACHE_INDEX 0xfff - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) - return -EINVAL; - - val |= BIT(30); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - /* - * We need to WBINVD on a core on the node containing the L3 cache which - * indices we disable therefore a simple wbinvd() is not sufficient. - */ - wbinvd_on_cpu(cpu); - pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -836,11 +849,6 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); - #define DEFAULT_SYSFS_CACHE_ATTRS \ &type.attr, \ &level.attr, \ @@ -859,8 +867,10 @@ static struct attribute *default_attrs[] = { static struct attribute *default_l3_attrs[] = { DEFAULT_SYSFS_CACHE_ATTRS, +#ifdef CONFIG_CPU_SUP_AMD &cache_disable_0.attr, &cache_disable_1.attr, +#endif NULL }; -- cgit v1.2.3-70-g09d2 From b72d0db9dd41da1f2ec6274b03e8909583c64e41 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 16:24:51 +0200 Subject: x86: Move pci init function to x86_init The PCI initialization in pci_subsys_init() is a mess. pci_numaq_init, pci_acpi_init, pci_visws_init and pci_legacy_init are called and each implementation checks and eventually modifies the global variable pcibios_scanned. x86_init functions allow us to do this more elegant. The pci.init function pointer is preset to pci_legacy_init. numaq, acpi and visws can modify the pointer in their early setup functions. The functions return 0 when they did the full initialization including bus scan. A non zero return value indicates that pci_legacy_init needs to be called either because the selected function failed or wants the generic bus scan in pci_legacy_init to happen (e.g. visws). Signed-off-by: Thomas Gleixner LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80CFE@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/numaq.h | 1 + arch/x86/include/asm/pci.h | 9 ++++++++- arch/x86/include/asm/pci_x86.h | 14 +++++++++++--- arch/x86/include/asm/setup.h | 2 -- arch/x86/include/asm/visws/cobalt.h | 2 ++ arch/x86/include/asm/x86_init.h | 9 +++++++++ arch/x86/kernel/acpi/boot.c | 4 ++++ arch/x86/kernel/apic/numaq_32.c | 1 + arch/x86/kernel/visws_quirks.c | 6 +----- arch/x86/kernel/x86_init.c | 5 +++++ arch/x86/pci/acpi.c | 6 +----- arch/x86/pci/common.c | 6 ------ arch/x86/pci/legacy.c | 22 ++++++++-------------- arch/x86/pci/numaq_32.c | 6 ------ arch/x86/pci/visws.c | 6 ++---- 15 files changed, 53 insertions(+), 46 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h index 9f0a5f5d29e..ef6bf81f846 100644 --- a/arch/x86/include/asm/numaq.h +++ b/arch/x86/include/asm/numaq.h @@ -30,6 +30,7 @@ extern int found_numaq; extern int get_memcfg_numaq(void); +extern int pci_numaq_init(void); extern void *xquad_portio; diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index ada8c201d51..8bd433ccc24 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus) #ifdef CONFIG_PCI extern unsigned int pcibios_assign_all_busses(void); +extern int pci_legacy_init(void); +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif #else -#define pcibios_assign_all_busses() 0 +# define pcibios_assign_all_busses() 0 +# define x86_default_pci_init NULL #endif extern unsigned long pci_mem_start; diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index b4bf9a942ed..440124f1224 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -82,7 +82,6 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; -extern int pcibios_scanned; extern spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); @@ -112,9 +111,8 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); extern int __init pcibios_irq_init(void); -extern int __init pci_visws_init(void); -extern int __init pci_numaq_init(void); extern int __init pcibios_init(void); +extern int pci_legacy_init(void); /* pci-mmconfig.c */ @@ -182,3 +180,13 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) { asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); } + +#ifdef CONFIG_PCI +# ifdef CONFIG_ACPI +# define x86_default_pci_init pci_acpi_init +# else +# define x86_default_pci_init pci_legacy_init +# endif +#else +# define x86_default_pci_init NULL +#endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 18e496c98ff..86b1506f417 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -37,10 +37,8 @@ void setup_bios_corruption_check(void); #ifdef CONFIG_X86_VISWS extern void visws_early_detect(void); -extern int is_visws_box(void); #else static inline void visws_early_detect(void) { } -static inline int is_visws_box(void) { return 0; } #endif extern unsigned long saved_video_mode; diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h index 166adf61e77..2edb37637ea 100644 --- a/arch/x86/include/asm/visws/cobalt.h +++ b/arch/x86/include/asm/visws/cobalt.h @@ -122,4 +122,6 @@ extern char visws_board_type; extern char visws_board_rev; +extern int pci_visws_init(void); + #endif /* _ASM_X86_VISWS_COBALT_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ea0e8ea15e1..f145d843f03 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -98,6 +98,14 @@ struct x86_init_iommu { int (*iommu_init)(void); }; + /* + * struct x86_init_pci - platform specific pci init functions + * @init: platform specific pci init + */ +struct x86_init_pci { + int (*init)(void); +}; + /** * struct x86_init_ops - functions for platform specific setup * @@ -110,6 +118,7 @@ struct x86_init_ops { struct x86_init_paging paging; struct x86_init_timers timers; struct x86_init_iommu iommu; + struct x86_init_pci pci; }; /** diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0acbcdfa5ca..054a5f5548b 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -1604,6 +1605,9 @@ int __init acpi_boot_init(void) acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); + if (!acpi_noirq) + x86_init.pci.init = pci_acpi_init; + return 0; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 98c4665f251..be5c4fd4777 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -277,6 +277,7 @@ static __init void early_check_numaq(void) x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; x86_init.timers.tsc_pre_init = numaq_tsc_init; + x86_init.pci.init = pci_numaq_init; } } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 34a279a7471..843e9d30a1e 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -49,11 +49,6 @@ extern int no_broadcast; char visws_board_type = -1; char visws_board_rev = -1; -int is_visws_box(void) -{ - return visws_board_type >= 0; -} - static void __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -242,6 +237,7 @@ void __init visws_early_detect(void) x86_init.irqs.pre_vector_init = visws_pre_intr_init; x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; + x86_init.pci.init = pci_visws_init; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36..81faa6d67d6 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -70,6 +71,10 @@ struct x86_init_ops x86_init __initdata = { .iommu = { .iommu_init = iommu_init_noop, }, + + .pci = { + .init = x86_default_pci_init, + }, }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 959e548a703..73b3fe9aa71 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -282,15 +282,11 @@ int __init pci_acpi_init(void) { struct pci_dev *dev = NULL; - if (pcibios_scanned) - return 0; - if (acpi_noirq) - return 0; + return -ENODEV; printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); acpi_irq_penalty_init(); - pcibios_scanned++; pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index d2552c68e94..f5770b5846a 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -71,12 +71,6 @@ struct pci_ops pci_root_ops = { .write = pci_write, }; -/* - * legacy, numa, and acpi all want to call pcibios_scan_root - * from their initcalls. This flag prevents that. - */ -int pcibios_scanned; - /* * This interrupt-safe spinlock protects all accesses to PCI * configuration space. diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 4061bb0f267..0daf264ddb6 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void) } } -static int __init pci_legacy_init(void) +int __init pci_legacy_init(void) { if (!raw_pci_ops) { printk("PCI: System does not support PCI\n"); return 0; } - if (pcibios_scanned++) - return 0; - printk("PCI: Probing PCI hardware\n"); pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) @@ -55,16 +52,13 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { -#ifdef CONFIG_X86_NUMAQ - pci_numaq_init(); -#endif -#ifdef CONFIG_ACPI - pci_acpi_init(); -#endif -#ifdef CONFIG_X86_VISWS - pci_visws_init(); -#endif - pci_legacy_init(); + /* + * The init function returns an non zero value when + * pci_legacy_init should be invoked. + */ + if (x86_init.pci.init()) + pci_legacy_init(); + pcibios_fixup_peer_bridges(); pcibios_irq_init(); pcibios_init(); diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8eb295e116f..45c0c9e4590 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -152,14 +152,8 @@ int __init pci_numaq_init(void) { int quad; - if (!found_numaq) - return 0; - raw_pci_ops = &pci_direct_conf1_mq; - if (pcibios_scanned++) - return 0; - pci_root_bus = pcibios_scan_root(0); if (pci_root_bus) pci_bus_add_devices(pci_root_bus); diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index bcead7a4687..03008f72eb0 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) int __init pci_visws_init(void) { - if (!is_visws_box()) - return -1; - pcibios_enable_irq = &pci_visws_enable_irq; pcibios_disable_irq = &pci_visws_disable_irq; @@ -90,5 +87,6 @@ int __init pci_visws_init(void) pci_scan_bus_with_sysdata(pci_bus1); pci_fixup_irqs(pci_common_swizzle, visws_map_irq); pcibios_resource_survey(); - return 0; + /* Request bus scan */ + return 1; } -- cgit v1.2.3-70-g09d2 From ab3b37937e8f4fb38dc9780b7bc3fd3c5195cca3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 17:47:33 +0200 Subject: x86: Add pci_init_irq to x86_init Moorestown wants to reuse pcibios_init_irq but needs to provide its own implementation of pci_enable_irq. After we distangled the init we can move the init_irq call to x86_init and remove the pci_enable_irq != NULL check in pcibios_init_irq. pci_enable_irq is compile time initialized to pirq_enable_irq and the special cases which override it (visws and acpi) set the x86_init function pointer to noop. That allows MSRT to override pci_enable_irq and otherwise run pcibios_init_irq unmodified. Signed-off-by: Thomas Gleixner LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80CFF@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pci_x86.h | 4 +++- arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/visws_quirks.c | 1 + arch/x86/kernel/x86_init.c | 1 + arch/x86/pci/acpi.c | 1 + arch/x86/pci/irq.c | 12 ++++-------- arch/x86/pci/legacy.c | 2 +- 7 files changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 440124f1224..46511c5be45 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -110,7 +110,7 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); -extern int __init pcibios_irq_init(void); +extern void __init pcibios_irq_init(void); extern int __init pcibios_init(void); extern int pci_legacy_init(void); @@ -187,6 +187,8 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) # else # define x86_default_pci_init pci_legacy_init # endif +# define x86_default_pci_init_irq pcibios_irq_init #else # define x86_default_pci_init NULL +# define x86_default_pci_init_irq NULL #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index f145d843f03..34f61cd56f3 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -101,9 +101,11 @@ struct x86_init_iommu { /* * struct x86_init_pci - platform specific pci init functions * @init: platform specific pci init + * @init_irq: platform specific pci irq init */ struct x86_init_pci { int (*init)(void); + void (*init_irq)(void); }; /** diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 843e9d30a1e..b48ef6c0d71 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -238,6 +238,7 @@ void __init visws_early_detect(void) x86_init.irqs.trap_init = visws_trap_init; x86_init.timers.timer_init = visws_time_init; x86_init.pci.init = pci_visws_init; + x86_init.pci.init_irq = x86_init_noop; /* * Install reboot quirks: diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 81faa6d67d6..203f26fb7f3 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -74,6 +74,7 @@ struct x86_init_ops x86_init __initdata = { .pci = { .init = x86_default_pci_init, + .init_irq = x86_default_pci_init_irq, }, }; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 73b3fe9aa71..b53f0487e2d 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -289,6 +289,7 @@ int __init pci_acpi_init(void) acpi_irq_penalty_init(); pcibios_enable_irq = acpi_pci_irq_enable; pcibios_disable_irq = acpi_pci_irq_disable; + x86_init.pci.init_irq = x86_init_noop; if (pci_routeirq) { /* diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0696d506c4a..0f40ff20dd6 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -53,7 +53,7 @@ struct irq_router_handler { int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); }; -int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; +int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq; void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; /* @@ -1110,12 +1110,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { { } }; -int __init pcibios_irq_init(void) +void __init pcibios_irq_init(void) { DBG(KERN_DEBUG "PCI: IRQ init\n"); - if (pcibios_enable_irq || raw_pci_ops == NULL) - return 0; + if (raw_pci_ops == NULL) + return; dmi_check_system(pciirq_dmi_table); @@ -1142,8 +1142,6 @@ int __init pcibios_irq_init(void) pirq_table = NULL; } - pcibios_enable_irq = pirq_enable_irq; - pcibios_fixup_irqs(); if (io_apic_assign_pci_irqs && pci_routeirq) { @@ -1157,8 +1155,6 @@ int __init pcibios_irq_init(void) for_each_pci_dev(dev) pirq_enable_irq(dev); } - - return 0; } static void pirq_penalize_isa_irq(int irq, int active) diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 0daf264ddb6..0db5eaf5456 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -60,7 +60,7 @@ int __init pci_subsys_init(void) pci_legacy_init(); pcibios_fixup_peer_bridges(); - pcibios_irq_init(); + x86_init.pci.init_irq(); pcibios_init(); return 0; -- cgit v1.2.3-70-g09d2 From 9325a28ce2fa7c597e5ed41455a06c30b82b5710 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 29 Aug 2009 17:51:26 +0200 Subject: x86: Add pcibios_fixup_irqs to x86_init Platforms like Moorestown want to override the pcibios_fixup_irqs default function. Add it to x86_init.pci. Signed-off-by: Thomas Gleixner LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D00@orsmsx508.amr.corp.intel.com> Acked-by: Jesse Barnes Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/pci_x86.h | 3 +++ arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/x86_init.c | 2 ++ arch/x86/pci/irq.c | 4 ++-- 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 46511c5be45..6e69edfbf07 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -113,6 +113,7 @@ extern int __init pci_acpi_init(void); extern void __init pcibios_irq_init(void); extern int __init pcibios_init(void); extern int pci_legacy_init(void); +extern void pcibios_fixup_irqs(void); /* pci-mmconfig.c */ @@ -188,7 +189,9 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val) # define x86_default_pci_init pci_legacy_init # endif # define x86_default_pci_init_irq pcibios_irq_init +# define x86_default_pci_fixup_irqs pcibios_fixup_irqs #else # define x86_default_pci_init NULL # define x86_default_pci_init_irq NULL +# define x86_default_pci_fixup_irqs NULL #endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 34f61cd56f3..8ef56f21f9f 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -102,10 +102,12 @@ struct x86_init_iommu { * struct x86_init_pci - platform specific pci init functions * @init: platform specific pci init * @init_irq: platform specific pci irq init + * @fixup_irqs: platform specific pci irq fixup */ struct x86_init_pci { int (*init)(void); void (*init_irq)(void); + void (*fixup_irqs)(void); }; /** diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 203f26fb7f3..1817cd7a03f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -4,6 +4,7 @@ * For licencing details see kernel-base/COPYING */ #include +#include #include #include @@ -75,6 +76,7 @@ struct x86_init_ops x86_init __initdata = { .pci = { .init = x86_default_pci_init, .init_irq = x86_default_pci_init_irq, + .fixup_irqs = x86_default_pci_fixup_irqs, }, }; diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 0f40ff20dd6..a60deb6e669 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1016,7 +1016,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) return 1; } -static void __init pcibios_fixup_irqs(void) +void __init pcibios_fixup_irqs(void) { struct pci_dev *dev = NULL; u8 pin; @@ -1142,7 +1142,7 @@ void __init pcibios_irq_init(void) pirq_table = NULL; } - pcibios_fixup_irqs(); + x86_init.pci.fixup_irqs(); if (io_apic_assign_pci_irqs && pci_routeirq) { struct pci_dev *dev = NULL; -- cgit v1.2.3-70-g09d2 From d39f6495f66616b637260405d0b5dc2656bc490e Mon Sep 17 00:00:00 2001 From: Alek Du Date: Mon, 7 Sep 2009 16:25:45 +0800 Subject: x86, ioapic: Improve handling of i8259A irq init Since we already track the number of legacy vectors by nr_legacy_irqs, we can avoid use static vector allocations -- we can use dynamic one. Signed-off-by: Alek Du LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D01@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 53243ca7816..75265ab83b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -140,27 +140,10 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node) /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg irq_cfgx[] = { +static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; #else -static struct irq_cfg irq_cfgx[NR_IRQS] = { +static struct irq_cfg irq_cfgx[NR_IRQS]; #endif - [0] = { .vector = IRQ0_VECTOR, }, - [1] = { .vector = IRQ1_VECTOR, }, - [2] = { .vector = IRQ2_VECTOR, }, - [3] = { .vector = IRQ3_VECTOR, }, - [4] = { .vector = IRQ4_VECTOR, }, - [5] = { .vector = IRQ5_VECTOR, }, - [6] = { .vector = IRQ6_VECTOR, }, - [7] = { .vector = IRQ7_VECTOR, }, - [8] = { .vector = IRQ8_VECTOR, }, - [9] = { .vector = IRQ9_VECTOR, }, - [10] = { .vector = IRQ10_VECTOR, }, - [11] = { .vector = IRQ11_VECTOR, }, - [12] = { .vector = IRQ12_VECTOR, }, - [13] = { .vector = IRQ13_VECTOR, }, - [14] = { .vector = IRQ14_VECTOR, }, - [15] = { .vector = IRQ15_VECTOR, }, -}; void __init io_apic_disable_legacy(void) { @@ -181,6 +164,8 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { + if (i < nr_legacy_irqs) + cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); -- cgit v1.2.3-70-g09d2 From 35f720c5930f689647d51ad77e2a8d6f0abf66c8 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 17 Sep 2009 07:36:43 -0700 Subject: x86: Initialize stack canary in secondary start Some secondary clockevent setup code needs to call request_irq, which will cause fake stack check failure in schedule() if voluntary preemption model is chosen. It is safe to have stack canary initialized here early, since start_secondary() does not return. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D02@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b4e870cbdc6..3e6150d421e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -324,6 +325,9 @@ notrace static void __cpuinit start_secondary(void *unused) /* enable local interrupts */ local_irq_enable(); + /* to prevent fake stack check failure in clock setup */ + boot_init_stack_canary(); + x86_cpuinit.setup_percpu_clockev(); wmb(); -- cgit v1.2.3-70-g09d2 From ef3548668c02cc8c3922f4423f32b53e662811c6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 9 Nov 2009 11:24:14 -0800 Subject: x86, pic: Introduce legacy_pic abstraction This patch makes i8259A like legacy programmable interrupt controller code into a driver so that legacy pic functions can be selected at runtime based on platform information, such as HW subarchitecure ID. Default structure of legacy_pic maintains the current code path for x86pc. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D03@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/i8259.h | 13 +++++++++++++ arch/x86/kernel/i8259.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 58d7091eeb1..e8a3e05c288 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -57,6 +57,19 @@ static inline void outb_pic(unsigned char value, unsigned int port) extern struct irq_chip i8259A_chip; +struct legacy_pic { + int nr_legacy_irqs; + struct irq_chip *chip; + void (*mask_all)(void); + void (*restore_mask)(void); + void (*init)(int auto_eoi); + int (*irq_pending)(unsigned int irq); + void (*make_irq)(unsigned int irq); +}; + +extern struct legacy_pic *legacy_pic; +extern struct legacy_pic null_legacy_pic; + extern void mask_8259A(void); extern void unmask_8259A(void); diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index df89102bef8..b80987ca33e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -358,3 +358,46 @@ void init_8259A(int auto_eoi) spin_unlock_irqrestore(&i8259A_lock, flags); } +/* + * make i8259 a driver so that we can select pic functions at run time. the goal + * is to make x86 binary compatible among pc compatible and non-pc compatible + * platforms, such as x86 MID. + */ + +static void __init legacy_pic_noop(void) { }; +static void __init legacy_pic_uint_noop(unsigned int unused) { }; +static void __init legacy_pic_int_noop(int unused) { }; + +static struct irq_chip dummy_pic_chip = { + .name = "dummy pic", + .mask = legacy_pic_uint_noop, + .unmask = legacy_pic_uint_noop, + .disable = legacy_pic_uint_noop, + .mask_ack = legacy_pic_uint_noop, +}; +static int legacy_pic_irq_pending_noop(unsigned int irq) +{ + return 0; +} + +struct legacy_pic null_legacy_pic = { + .nr_legacy_irqs = 0, + .chip = &dummy_pic_chip, + .mask_all = legacy_pic_noop, + .restore_mask = legacy_pic_noop, + .init = legacy_pic_int_noop, + .irq_pending = legacy_pic_irq_pending_noop, + .make_irq = legacy_pic_uint_noop, +}; + +struct legacy_pic default_legacy_pic = { + .nr_legacy_irqs = NR_IRQS_LEGACY, + .chip = &i8259A_chip, + .mask_all = mask_8259A, + .restore_mask = unmask_8259A, + .init = init_8259A, + .irq_pending = i8259A_irq_pending, + .make_irq = make_8259A_irq, +}; + +struct legacy_pic *legacy_pic = &default_legacy_pic; -- cgit v1.2.3-70-g09d2 From b81bb373a7e832a43921356aa1291044d7f52fb1 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Mon, 9 Nov 2009 11:27:04 -0800 Subject: x86, pic: Make use of legacy_pic abstraction This patch replaces legacy PIC-related global variable and functions with the new legacy_pic abstraction. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D04@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/hw_irq.h | 7 ------ arch/x86/include/asm/i8259.h | 8 ------ arch/x86/kernel/apic/apic.c | 8 +++--- arch/x86/kernel/apic/io_apic.c | 57 ++++++++++++++++++++---------------------- arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/i8259.c | 21 ++++++++++------ arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/smpboot.c | 5 ++-- arch/x86/kernel/visws_quirks.c | 14 +++++++---- 9 files changed, 59 insertions(+), 65 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eeac829a0f4..a929c9ede33 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,13 +53,6 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); -/* PIC specific functions */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); - /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index e8a3e05c288..2832babd91f 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -26,11 +26,6 @@ extern unsigned int cached_irq_mask; extern spinlock_t i8259A_lock; -extern void init_8259A(int auto_eoi); -extern void enable_8259A_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern unsigned int startup_8259A_irq(unsigned int irq); - /* the PIC may need a careful delay on some platforms, hence specific calls */ static inline unsigned char inb_pic(unsigned int port) { @@ -70,7 +65,4 @@ struct legacy_pic { extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; -extern void mask_8259A(void); -extern void unmask_8259A(void); - #endif /* _ASM_X86_I8259_H */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dfca210f6a1..94f22b12858 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1390,7 +1390,7 @@ void __init enable_IR_x2apic(void) } local_irq_save(flags); - mask_8259A(); + legacy_pic->mask_all(); mask_IO_APIC_setup(ioapic_entries); if (dmar_table_init_ret) @@ -1422,7 +1422,7 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); + legacy_pic->restore_mask(); local_irq_restore(flags); out: @@ -2018,7 +2018,7 @@ static int lapic_resume(struct sys_device *dev) } mask_IO_APIC_setup(ioapic_entries); - mask_8259A(); + legacy_pic->mask_all(); } if (x2apic_mode) @@ -2062,7 +2062,7 @@ static int lapic_resume(struct sys_device *dev) if (intr_remapping_enabled) { reenable_intr_remapping(x2apic_mode); - unmask_8259A(); + legacy_pic->restore_mask(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 75265ab83b1..1704cd82db5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -94,8 +94,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -147,7 +145,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS]; void __init io_apic_disable_legacy(void) { - nr_legacy_irqs = 0; nr_irqs_gsi = 0; } @@ -164,13 +161,13 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -850,7 +847,7 @@ static int __init find_isa_irq_apic(int irq, int type) */ static int EISA_ELCR(unsigned int irq) { - if (irq < nr_legacy_irqs) { + if (irq < legacy_pic->nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1446,8 +1443,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq } ioapic_register_intr(irq, desc, trigger); - if (irq < nr_legacy_irqs) - disable_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->chip->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -1810,7 +1807,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1894,7 +1891,7 @@ void __init enable_IO_APIC(void) nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; for(apic = 0; apic < nr_ioapics; apic++) { @@ -1951,7 +1948,7 @@ void disable_IO_APIC(void) */ clear_IO_APIC(); - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; /* @@ -2184,9 +2181,9 @@ static unsigned int startup_ioapic_irq(unsigned int irq) struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < nr_legacy_irqs) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->chip->mask(irq); + if (legacy_pic->irq_pending(irq)) was_pending = 1; } cfg = irq_cfg(irq); @@ -2719,8 +2716,8 @@ static inline void init_IO_APIC_traps(void) * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < nr_legacy_irqs) - make_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; @@ -2877,7 +2874,7 @@ static inline void __init check_timer(void) /* * get/set the timer IRQ vector: */ - disable_8259A_irq(0); + legacy_pic->chip->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2890,7 +2887,7 @@ static inline void __init check_timer(void) * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); + legacy_pic->init(1); #ifdef CONFIG_X86_32 { unsigned int ver; @@ -2949,7 +2946,7 @@ static inline void __init check_timer(void) if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2972,14 +2969,14 @@ static inline void __init check_timer(void) */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } goto out; } @@ -2987,7 +2984,7 @@ static inline void __init check_timer(void) * Cleanup, just in case ... */ local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -3006,22 +3003,22 @@ static inline void __init check_timer(void) lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as ExtINT IRQ...\n"); - init_8259A(0); - make_8259A_irq(0); + legacy_pic->init(0); + legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); @@ -3063,7 +3060,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3074,7 +3071,7 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (nr_legacy_irqs) + if (legacy_pic->nr_legacy_irqs) check_timer(); } @@ -3875,7 +3872,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq, /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= nr_legacy_irqs) { + if (irq >= legacy_pic->nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 0159a69396c..3817739acee 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -177,7 +177,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - disable_8259A_irq(0); + legacy_pic->chip->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index b80987ca33e..1c790e75f7a 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -34,6 +34,12 @@ static int i8259A_auto_eoi; DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq); struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -63,7 +69,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -77,7 +83,7 @@ void disable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -91,7 +97,7 @@ void enable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<init(0); /* * 16 old-style INTA-cycle interrupts: diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3e6150d421e..f7a52f4a21a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -68,6 +68,7 @@ #include #include +#include #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; @@ -287,9 +288,9 @@ notrace static void __cpuinit start_secondary(void *unused) check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); enable_NMI_through_LVT0(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index b48ef6c0d71..f067e9556a4 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -505,7 +505,7 @@ static struct irq_chip cobalt_irq_type = { */ static unsigned int startup_piix4_master_irq(unsigned int irq) { - init_8259A(0); + legacy_pic->init(0); return startup_cobalt_irq(irq); } @@ -529,9 +529,6 @@ static struct irq_chip piix4_master_irq_type = { static struct irq_chip piix4_virtual_irq_type = { .name = "PIIX4-virtual", - .shutdown = disable_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, }; @@ -606,7 +603,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) handle_IRQ_event(realirq, desc->action); if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); + legacy_pic->chip->unmask(realirq); return IRQ_HANDLED; @@ -625,6 +622,12 @@ static struct irqaction cascade_action = { .name = "cascade", }; +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.shutdown = i8259A_chip.mask; + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; +} void init_VISWS_APIC_irqs(void) { @@ -650,6 +653,7 @@ void init_VISWS_APIC_irqs(void) desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { + set_piix4_virtual_irq_type(); desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { -- cgit v1.2.3-70-g09d2 From 1f91233c26fd5f7d6525fd29b95e4b50ca7a3e88 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 5 Feb 2010 04:06:56 -0800 Subject: x86, apic: Remove ioapic_disable_legacy() The ioapic_disable_legacy() call is no longer needed for platforms do not have legacy pic. the legacy pic abstraction has taken care it automatically. This patch also initialize irq-related static variables based on information obtained from legacy_pic. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A30A7660@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_apic.h | 2 -- arch/x86/kernel/apic/io_apic.c | 10 +++++----- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 7c7c16cde1f..84fdd511094 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -143,8 +143,6 @@ extern int noioapicreroute; /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ extern int timer_through_8259; -extern void io_apic_disable_legacy(void); - /* * If we use the IO-APIC for IRQ routing, disable automatic * assignment of PCI IRQ's. diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1704cd82db5..3592a72f3f0 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -143,11 +143,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; static struct irq_cfg irq_cfgx[NR_IRQS]; #endif -void __init io_apic_disable_legacy(void) -{ - nr_irqs_gsi = 0; -} - int __init arch_early_irq_init(void) { struct irq_cfg *cfg; @@ -156,6 +151,11 @@ int __init arch_early_irq_init(void) int node; int i; + if (!legacy_pic->nr_legacy_irqs) { + nr_irqs_gsi = 0; + io_apic_irqs = ~0UL; + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node= cpu_to_node(boot_cpu_id); -- cgit v1.2.3-70-g09d2 From ff7fbc72e0c3ef7e94a27a3a918fd09ec9a30204 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 22 Feb 2010 14:51:33 -0800 Subject: x86, ptrace: Simplify xstateregs_get() 48 bytes (bytes 464..511) of the xstateregs payload come from the kernel defined structure (xstate_fx_sw_bytes). Rest comes from the xstate regs structure in the thread struct. Instead of having multiple user_regset_copyout()'s, simplify the xstateregs_get() by first copying the SW bytes into the xstate regs structure in the thread structure and then using one user_regset_copyout() to copyout the xstateregs. Requested-by: Roland McGrath Signed-off-by: Suresh Siddha LKML-Reference: <20100222225240.494688491@sbs-t61.sc.intel.com> Acked-by: Roland McGrath Signed-off-by: H. Peter Anvin Cc: Oleg Nesterov --- arch/x86/kernel/i387.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 7a8a193b514..81e23bf12c1 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -243,34 +243,18 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, return ret; /* - * First copy the fxsave bytes 0..463. + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave, 0, - offsetof(struct user_xstateregs, - i387.xstate_fx_sw)); - if (ret) - return ret; - - /* - * Copy the 48bytes defined by software. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - xstate_fx_sw_bytes, - offsetof(struct user_xstateregs, - i387.xstate_fx_sw), - offsetof(struct user_xstateregs, - xsave_hdr)); - if (ret) - return ret; + memcpy(&target->thread.xstate->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); /* - * Copy the rest of xstate memory layout. + * Copy the xstate memory layout. */ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave.xsave_hdr, - offsetof(struct user_xstateregs, - xsave_hdr), -1); + &target->thread.xstate->xsave, 0, -1); return ret; } -- cgit v1.2.3-70-g09d2 From 6dbbe14f21368a45aedba7eab0221857b8ad8d16 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 22 Feb 2010 14:51:34 -0800 Subject: x86, ptrace: Remove set_stopped_child_used_math() in [x]fpregs_set init_fpu() already ensures that the used_math() is set for the stopped child. Remove the redundant set_stopped_child_used_math() in [x]fpregs_set() Reported-by: Oleg Nesterov Signed-off-by: Suresh Siddha LKML-Reference: <20100222225240.642169080@sbs-t61.sc.intel.com> Acked-by: Rolan McGrath Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 81e23bf12c1..c01a2b846d4 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -209,8 +209,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); @@ -471,8 +469,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - set_stopped_child_used_math(target); - if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); -- cgit v1.2.3-70-g09d2 From 28a3c93d11212655ce0a9be977c405c703844164 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 23 Feb 2010 02:03:31 -0800 Subject: x86, pic: Fix section mismatch in legacy pic Move legacy_pic chip dummy functions out of init section as they might be referenced at run time. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D3AA@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i8259.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 9bac6817456..fb725ee15f5 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -371,9 +371,9 @@ static void init_8259A(int auto_eoi) * platforms, such as x86 MID. */ -static void __init legacy_pic_noop(void) { }; -static void __init legacy_pic_uint_noop(unsigned int unused) { }; -static void __init legacy_pic_int_noop(int unused) { }; +static void legacy_pic_noop(void) { }; +static void legacy_pic_uint_noop(unsigned int unused) { }; +static void legacy_pic_int_noop(int unused) { }; static struct irq_chip dummy_pic_chip = { .name = "dummy pic", -- cgit v1.2.3-70-g09d2 From 05ddafb17ad1a73c8bc333cb328bad46513e85e7 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 23 Sep 2009 07:20:23 -0700 Subject: x86, ioapic: Early enable ioapic for timer irq Moorestown platform needs apic ready early for the system timer irq which is delievered via ioapic. Should not impact other platforms. In the longer term, once ioapic setup is moved before late time init, we will not need this patch to do early apic enabling. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D07@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/io_apic.h | 1 + arch/x86/kernel/apic/io_apic.c | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 31dfb42d864..99a416ed16b 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -187,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[]; int mp_find_ioapic(int gsi); int mp_find_ioapic_pin(int ioapic, int gsi); void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); +extern void __init pre_init_apic_IRQ0(void); #else /* !CONFIG_X86_IO_APIC */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b34854358ee..8c848b5877a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -4289,3 +4289,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } + +/* Enable IOAPIC early just for system timer */ +void __init pre_init_apic_IRQ0(void) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + + printk(KERN_INFO "Early APIC setup for system timer0\n"); +#ifndef CONFIG_SMP + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); +#endif + desc = irq_to_desc_alloc_node(0, 0); + + setup_local_APIC(); + + cfg = irq_cfg(0); + add_pin_to_irq_node(cfg, 0, 0, 0); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); +} -- cgit v1.2.3-70-g09d2 From 5b78b6724a405d4b649db53f7aac28c930c89640 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 02:29:11 -0800 Subject: x86, mrst: Add dummy legacy pic to platform setup Moorestown has no legacy PIC; point it to the null legacy PIC. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D09@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/mrst.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 3b7078abc87..bdf9b17290c 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -12,6 +12,9 @@ #include #include +#include +#include +#include /* * Moorestown specific x86_init function overrides and early setup @@ -21,4 +24,6 @@ void __init x86_mrst_early_setup(void) { x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + + legacy_pic = &null_legacy_pic; } -- cgit v1.2.3-70-g09d2 From af2730f6eefce24c4ef1dc3f8267d33626db81bc Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 10:31:47 -0800 Subject: x86, mrst: Fill in PCI functions in x86_init layer This patch added Moorestown platform specific PCI init functions. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0A@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mrst.h | 15 +++++++++++++++ arch/x86/kernel/mrst.c | 3 +++ 2 files changed, 18 insertions(+) create mode 100644 arch/x86/include/asm/mrst.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h new file mode 100644 index 00000000000..57a177a8e82 --- /dev/null +++ b/arch/x86/include/asm/mrst.h @@ -0,0 +1,15 @@ +/* + * mrst.h: Intel Moorestown platform specific setup code + * + * (C) Copyright 2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#ifndef _ASM_X86_MRST_H +#define _ASM_X86_MRST_H +extern int pci_mrst_init(void); + +#endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index bdf9b17290c..98440e18919 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,5 +25,8 @@ void __init x86_mrst_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + x86_init.pci.init = pci_mrst_init; + x86_init.pci.fixup_irqs = x86_init_noop; + legacy_pic = &null_legacy_pic; } -- cgit v1.2.3-70-g09d2 From 16ab5395856d8953ae3d81e81bd6a8c269a1bfd6 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 03:08:30 -0800 Subject: x86, mrst: Add platform timer info parsing code Moorestown platform timer information is obtained from SFI FW tables. This patch parses SFI table then assign the irq information to mp_irqs. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0B@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mrst.h | 2 + arch/x86/kernel/mrst.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index 57a177a8e82..fa144f2dd25 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -12,4 +12,6 @@ #define _ASM_X86_MRST_H extern int pci_mrst_init(void); +#define SFI_MTMR_MAX_NUM 8 + #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 98440e18919..bb6e45c71dd 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -10,12 +10,122 @@ * of the License. */ #include +#include +#include +#include #include +#include +#include +#include +#include #include #include #include +static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; +static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +int sfi_mtimer_num; + +static inline void assign_to_mp_irq(struct mpc_intsrc *m, + struct mpc_intsrc *mp_irq) +{ + memcpy(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq, + struct mpc_intsrc *m) +{ + return memcmp(mp_irq, m, sizeof(struct mpc_intsrc)); +} + +static void save_mp_irq(struct mpc_intsrc *m) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + if (!mp_irq_cmp(&mp_irqs[i], m)) + return; + } + + assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +/* parse all the mtimer info to a static mtimer array */ +static int __init sfi_parse_mtmr(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_timer_table_entry *pentry; + struct mpc_intsrc mp_irq; + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mtimer_num) { + sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_timer_table_entry); + pentry = (struct sfi_timer_table_entry *) sb->pentry; + totallen = sfi_mtimer_num * sizeof(*pentry); + memcpy(sfi_mtimer_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num); + pentry = sfi_mtimer_array; + for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) { + printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz," + " irq = %d\n", totallen, (u32)pentry->phys_addr, + pentry->freq_hz, pentry->irq); + if (!pentry->irq) + continue; + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; +/* triggering mode edge bit 2-3, active high polarity bit 0-1 */ + mp_irq.irqflag = 5; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + + return 0; +} + +struct sfi_timer_table_entry *sfi_get_mtmr(int hint) +{ + int i; + if (hint < sfi_mtimer_num) { + if (!sfi_mtimer_usage[hint]) { + pr_debug("hint taken for timer %d irq %d\n",\ + hint, sfi_mtimer_array[hint].irq); + sfi_mtimer_usage[hint] = 1; + return &sfi_mtimer_array[hint]; + } + } + /* take the first timer available */ + for (i = 0; i < sfi_mtimer_num;) { + if (!sfi_mtimer_usage[i]) { + sfi_mtimer_usage[i] = 1; + return &sfi_mtimer_array[i]; + } + i++; + } + return NULL; +} + +void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) +{ + int i; + for (i = 0; i < sfi_mtimer_num;) { + if (mtmr->irq == sfi_mtimer_array[i].irq) { + sfi_mtimer_usage[i] = 0; + return; + } + i++; + } +} + /* * Moorestown specific x86_init function overrides and early setup * calls. -- cgit v1.2.3-70-g09d2 From cf089455966e21aeb8e4cd2669e0c1885667b04e Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 12 Feb 2010 03:37:38 -0800 Subject: x86, mrst: Add vrtc platform data setup code vRTC information is obtained from SFI tables on Moorestown, this patch parses these tables and assign the information. Signed-off-by: Feng Tang LKML-Reference: <43F901BD926A4E43B106BF17856F07559FB80D0D@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/mrst.h | 2 ++ arch/x86/kernel/mrst.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h index fa144f2dd25..451d30e7f62 100644 --- a/arch/x86/include/asm/mrst.h +++ b/arch/x86/include/asm/mrst.h @@ -11,7 +11,9 @@ #ifndef _ASM_X86_MRST_H #define _ASM_X86_MRST_H extern int pci_mrst_init(void); +int __init sfi_parse_mrtc(struct sfi_table_header *table); #define SFI_MTMR_MAX_NUM 8 +#define SFI_MRTC_MAX 8 #endif /* _ASM_X86_MRST_H */ diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index bb6e45c71dd..b7fa049c826 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,10 @@ static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; int sfi_mtimer_num; +struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; +EXPORT_SYMBOL_GPL(sfi_mrtc_array); +int sfi_mrtc_num; + static inline void assign_to_mp_irq(struct mpc_intsrc *m, struct mpc_intsrc *mp_irq) { @@ -126,6 +131,46 @@ void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr) } } +/* parse all the mrtc info to a global mrtc array */ +int __init sfi_parse_mrtc(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_rtc_table_entry *pentry; + struct mpc_intsrc mp_irq; + + int totallen; + + sb = (struct sfi_table_simple *)table; + if (!sfi_mrtc_num) { + sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb, + struct sfi_rtc_table_entry); + pentry = (struct sfi_rtc_table_entry *)sb->pentry; + totallen = sfi_mrtc_num * sizeof(*pentry); + memcpy(sfi_mrtc_array, pentry, totallen); + } + + printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num); + pentry = sfi_mrtc_array; + for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) { + printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n", + totallen, (u32)pentry->phys_addr, pentry->irq); + mp_irq.type = MP_IOAPIC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = 0; + mp_irq.srcbus = 0; + mp_irq.srcbusirq = pentry->irq; /* IRQ */ + mp_irq.dstapic = MP_APIC_ALL; + mp_irq.dstirq = pentry->irq; + save_mp_irq(&mp_irq); + } + return 0; +} + +void __init mrst_rtc_init(void) +{ + sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); +} + /* * Moorestown specific x86_init function overrides and early setup * calls. -- cgit v1.2.3-70-g09d2 From bb24c4716185f6e116c440462c65c1f56649183b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 2 Sep 2009 07:37:17 -0700 Subject: x86, apbt: Moorestown APB system timer driver Moorestown platform does not have PIT or HPET platform timers. Instead it has a bank of eight APB timers. The number of available timers to the os is exposed via SFI mtmr tables. All APB timer interrupts are routed via ioapic rtes and delivered as MSI. Currently, we use timer 0 and 1 for per cpu clockevent devices, timer 2 for clocksource. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D2D2@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- Documentation/kernel-parameters.txt | 6 + arch/x86/Kconfig | 11 + arch/x86/include/asm/apb_timer.h | 70 ++++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/apb_timer.c | 780 ++++++++++++++++++++++++++++++++++++ 5 files changed, 868 insertions(+) create mode 100644 arch/x86/include/asm/apb_timer.h create mode 100644 arch/x86/kernel/apb_timer.c (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 51bceb0fb27..917220459e2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2793,6 +2793,12 @@ and is between 256 and 4096 characters. It is defined in the file default x2apic cluster mode on platforms supporting x2apic. + x86_mrst_timer= [X86-32,APBT] + Choose timer option for x86 Moorestown MID platform. + Two valid options are apbt timer only and lapic timer + plus one apbt timer for broadcast timer. + x86_mrst_timer=apbt_only | lapic_and_apbt + xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks. xd_geo= See header of drivers/block/xd.c. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb4092568f9..0ab2dcef7d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -390,6 +390,7 @@ config X86_MRST bool "Moorestown MID platform" depends on X86_32 depends on X86_EXTENDED_PLATFORM + select APB_TIMER ---help--- Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin Internet Device(MID) platform. Moorestown consists of two chips: @@ -612,6 +613,16 @@ config HPET_EMULATE_RTC def_bool y depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) +config APB_TIMER + def_bool y if MRST + prompt "Langwell APB Timer Support" if X86_MRST + help + APB timer is the replacement for 8254, HPET on X86 MID platforms. + The APBT provides a stable time base on SMP + systems, unlike the TSC, but it is more expensive to access, + as it is off-chip. APB timers are always running regardless of CPU + C states, they are used as per CPU clockevent device when possible. + # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. config DMI diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h new file mode 100644 index 00000000000..c74a2eebe57 --- /dev/null +++ b/arch/x86/include/asm/apb_timer.h @@ -0,0 +1,70 @@ +/* + * apb_timer.h: Driver for Langwell APB timer based on Synopsis DesignWare + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + */ + +#ifndef ASM_X86_APBT_H +#define ASM_X86_APBT_H +#include + +#ifdef CONFIG_APB_TIMER + +/* Langwell DW APB timer registers */ +#define APBTMR_N_LOAD_COUNT 0x00 +#define APBTMR_N_CURRENT_VALUE 0x04 +#define APBTMR_N_CONTROL 0x08 +#define APBTMR_N_EOI 0x0c +#define APBTMR_N_INT_STATUS 0x10 + +#define APBTMRS_INT_STATUS 0xa0 +#define APBTMRS_EOI 0xa4 +#define APBTMRS_RAW_INT_STATUS 0xa8 +#define APBTMRS_COMP_VERSION 0xac +#define APBTMRS_REG_SIZE 0x14 + +/* register bits */ +#define APBTMR_CONTROL_ENABLE (1<<0) +#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /*1: periodic 0:free running */ +#define APBTMR_CONTROL_INT (1<<2) + +/* default memory mapped register base */ +#define LNW_SCU_ADDR 0xFF100000 +#define LNW_EXT_TIMER_OFFSET 0x1B800 +#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET) +#define LNW_EXT_TIMER_PGOFFSET 0x800 + +/* APBT clock speed range from PCLK to fabric base, 25-100MHz */ +#define APBT_MAX_FREQ 50 +#define APBT_MIN_FREQ 1 +#define APBT_MMAP_SIZE 1024 + +#define APBT_DEV_USED 1 + +extern void apbt_time_init(void); +extern struct clock_event_device *global_clock_event; +extern unsigned long apbt_quick_calibrate(void); +extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); +extern void apbt_setup_secondary_clock(void); +extern unsigned int boot_cpu_id; +extern int disable_apbt_percpu; + +extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); +extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); +extern int sfi_mtimer_num; + +#else /* CONFIG_APB_TIMER */ + +static inline unsigned long apbt_quick_calibrate(void) {return 0; } +static inline void apbt_time_init(void) {return 0; } + +#endif +#endif /* ASM_X86_APBT_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d87f09bc5a5..4c58352209e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_HPET_TIMER) += hpet.o +obj-$(CONFIG_APB_TIMER) += apb_timer.o obj-$(CONFIG_K8_NB) += k8.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c new file mode 100644 index 00000000000..83a345b0256 --- /dev/null +++ b/arch/x86/kernel/apb_timer.c @@ -0,0 +1,780 @@ +/* + * apb_timer.c: Driver for Langwell APB timers + * + * (C) Copyright 2009 Intel Corporation + * Author: Jacob Pan (jacob.jun.pan@intel.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * Note: + * Langwell is the south complex of Intel Moorestown MID platform. There are + * eight external timers in total that can be used by the operating system. + * The timer information, such as frequency and addresses, is provided to the + * OS via SFI tables. + * Timer interrupts are routed via FW/HW emulated IOAPIC independently via + * individual redirection table entries (RTE). + * Unlike HPET, there is no master counter, therefore one of the timers are + * used as clocksource. The overall allocation looks like: + * - timer 0 - NR_CPUs for per cpu timer + * - one timer for clocksource + * - one timer for watchdog driver. + * It is also worth notice that APB timer does not support true one-shot mode, + * free-running mode will be used here to emulate one-shot mode. + * APB timer can also be used as broadcast timer along with per cpu local APIC + * timer, but by default APB timer has higher rating than local APIC timers. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 + +#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) +#define APBT_CLOCKEVENT0_NUM (0) +#define APBT_CLOCKEVENT1_NUM (1) +#define APBT_CLOCKSOURCE_NUM (2) + +static unsigned long apbt_address; +static int apb_timer_block_enabled; +static void __iomem *apbt_virt_address; +static int phy_cs_timer_id; + +/* + * Common DW APB timer info + */ +static uint64_t apbt_freq; + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt); +static cycle_t apbt_read_clocksource(struct clocksource *cs); +static void apbt_restart_clocksource(void); + +struct apbt_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; +}; + +int disable_apbt_percpu __cpuinitdata; + +#ifdef CONFIG_SMP +static unsigned int apbt_num_timers_used; +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); +static struct apbt_dev *apbt_devs; +#endif + +static inline unsigned long apbt_readl_reg(unsigned long a) +{ + return readl(apbt_virt_address + a); +} + +static inline void apbt_writel_reg(unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a); +} + +static inline unsigned long apbt_readl(int n, unsigned long a) +{ + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_writel(int n, unsigned long d, unsigned long a) +{ + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); +} + +static inline void apbt_set_mapping(void) +{ + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; + +panic_noapbt: + panic("Failed to setup APB system timer\n"); + +} + +static inline void apbt_clear_mapping(void) +{ + iounmap(apbt_virt_address); + apbt_virt_address = NULL; +} + +/* + * APBT timer interrupt enable / disable + */ +static inline int is_apbt_capable(void) +{ + return apbt_virt_address ? 1 : 0; +} + +static struct clocksource clocksource_apbt = { + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, +}; + +/* boot APB clock event device */ +static struct clock_event_device apbt_clockevent = { + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, +}; + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; +} +__setup("x86_mrst_timer=", setup_x86_mrst_timer); + +/* + * start count down from 0xffff_ffff. this is done by toggling the enable bit + * then load initial load count to ~0. + */ +static void apbt_start_counter(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); +} + +static irqreturn_t apbt_interrupt_handler(int irq, void *data) +{ + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; +} + +static void apbt_restart_clocksource(void) +{ + apbt_start_counter(phy_cs_timer_id); +} + +/* Setup IRQ routing via IOAPIC */ +#ifdef CONFIG_SMP +static void apbt_setup_irq(struct apbt_dev *adev) +{ + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } +} +#endif + +static void apbt_enable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + +static void apbt_disable_int(int n) +{ + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); +} + + +static int __init apbt_clockevent_register(void) +{ + struct sfi_timer_table_entry *mtmr; + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + global_clock_event = &apbt_clockevent; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, &apbt_clockevent)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&apbt_clockevent); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; +} + +#ifdef CONFIG_SMP +/* Should be called with per cpu */ +void apbt_setup_secondary_clock(void) +{ + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; +} + +/* + * this notify handler process CPU hotplug events. in case of S0i3, nonboot + * cpus are disabled/enabled frequently, for performance reasons, we keep the + * per cpu timer irq registered so that we do need to do free_irq/request_irq. + * + * TODO: it might be more reliable to directly disable percpu clockevent device + * without the notifier chain. currently, cpu 0 may get interrupts from other + * cpu timers during the offline process due to the ordering of notification. + * the extra interrupt is harmless. + */ +static int apbt_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; +} + +static __init int apbt_late_init(void) +{ + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; +} +fs_initcall(apbt_late_init); +#else + +void apbt_setup_secondary_clock(void) {} + +#endif /* CONFIG_SMP */ + +static void apbt_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. + */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } +} + +static int apbt_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; +} + +/* + * APB timer clock is not in sync with pclk on Langwell, which translates to + * unreliable read value caused by sampling error. the error does not add up + * overtime and only happens when sampling a 0 as a 1 by mistake. so the time + * would go backwards. the following code is trying to prevent time traveling + * backwards. little bit paranoid. + */ +static cycle_t apbt_read_clocksource(struct clocksource *cs) +{ + unsigned long t0, t1, t2; + static unsigned long last_read; + +bad_count: + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. + */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); +bad_count_x3: + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } +out: + last_read = t2; + return (cycle_t)~t2; +} + +static int apbt_clocksource_register(void) +{ + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; +} + +/* + * Early setup the APBT timer, only use timer 0 for booting then switch to + * per CPU timer if possible. + * returns 1 if per cpu apbt is setup + * returns 0 if no per cpu apbt is chosen + * panic if set up failed, this is the only platform timer on Moorestown. + */ +void __init apbt_time_init(void) +{ +#ifdef CONFIG_SMP + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; +#endif + + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } +#ifdef CONFIG_SMP + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } +#endif + + return; + +out_noapbt: + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); +} + +static inline void apbt_disable(int n) +{ + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } +} + +/* called before apb_timer_enable, use early map */ +unsigned long apbt_quick_calibrate() +{ + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; +failed: + return 0; +} -- cgit v1.2.3-70-g09d2 From 3746c6b6e26b8ad605f11b43e54acb3481d40980 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 12 Feb 2010 05:01:12 -0800 Subject: x86, mrst: Platform clock setup code Add Moorestown platform clock setup code to the x86_init abstraction. Signed-off-by: Jacob Pan LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318D2D4@orsmsx508.amr.corp.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/mrst.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index b7fa049c826..0aad8670858 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -23,6 +23,7 @@ #include #include #include +#include static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; @@ -166,11 +167,55 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } +/* + * the secondary clock in Moorestown can be APBT or LAPIC clock, default to + * APBT but cmdline option can also override it. + */ +static void __cpuinit mrst_setup_secondary_clock(void) +{ + /* restore default lapic clock if disabled by cmdline */ + if (disable_apbt_percpu) + return setup_secondary_APIC_clock(); + apbt_setup_secondary_clock(); +} + +static unsigned long __init mrst_calibrate_tsc(void) +{ + unsigned long flags, fast_calibrate; + + local_irq_save(flags); + fast_calibrate = apbt_quick_calibrate(); + local_irq_restore(flags); + + if (fast_calibrate) + return fast_calibrate; + + return 0; +} + +void __init mrst_time_init(void) +{ + sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); + pre_init_apic_IRQ0(); + apbt_time_init(); +} + void __init mrst_rtc_init(void) { sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } +/* + * if we use per cpu apb timer, the bootclock already setup. if we use lapic + * timer and one apbt timer for broadcast, we need to set up lapic boot clock. + */ +static void __init mrst_setup_boot_clock(void) +{ + pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); + if (disable_apbt_percpu) + setup_boot_APIC_clock(); +}; + /* * Moorestown specific x86_init function overrides and early setup * calls. @@ -180,6 +225,14 @@ void __init x86_mrst_early_setup(void) x86_init.resources.probe_roms = x86_init_noop; x86_init.resources.reserve_resources = x86_init_noop; + x86_init.timers.timer_init = mrst_time_init; + x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + + x86_init.irqs.pre_vector_init = x86_init_noop; + + x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + + x86_platform.calibrate_tsc = mrst_calibrate_tsc; x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; -- cgit v1.2.3-70-g09d2 From 28c6a0ba30457380b140d9d7a61530eda8969180 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 23 Feb 2010 20:27:48 -0800 Subject: x86, legacy_irq: Remove left over nr_legacy_irqs nr_legacy_irqs and its ilk have moved to legacy_pic. -v2: there is one in ioapic_.c Singed-off-by: Yinghai Lu LKML-Reference: <4B84AAC4.2020204@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/irq.h | 1 - arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/irqinit.c | 7 ++----- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index 262292729fc..5458380b6ef 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -48,6 +48,5 @@ extern DECLARE_BITMAP(used_vectors, NR_VECTORS); extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); -extern int nr_legacy_irqs; #endif /* _ASM_X86_IRQ_H */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 8c848b5877a..b9d08f0e1e3 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1440,7 +1440,7 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq * controllers like 8259. Now that IO-APIC can handle this irq, update * the cfg->domain. */ - if (irq < nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) + if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain)) apic->vector_allocation_domain(0, cfg->domain); if (assign_irq_vector(irq, cfg, apic->target_cpus())) diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index d2f787b3de5..ef257fc2921 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -99,9 +99,6 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Number of legacy interrupts */ -int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; - void __init init_ISA_irqs(void) { int i; @@ -114,7 +111,7 @@ void __init init_ISA_irqs(void) /* * 16 old-style INTA-cycle interrupts: */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; @@ -138,7 +135,7 @@ void __init init_IRQ(void) * then this vector space can be freed and re-used dynamically as the * irq's migrate etc. */ - for (i = 0; i < nr_legacy_irqs; i++) + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i; x86_init.irqs.intr_init(); -- cgit v1.2.3-70-g09d2 From 9eeeb09edba1e3544526611663472743ca584d36 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 23 Feb 2010 18:49:04 -0800 Subject: x86, legacy_irq: Remove duplicate vector assigment Remove duplicated cfg[i].vector assignment. Signed-off-by: Yinghai Lu LKML-Reference: <4B8493A0.6080501@kernel.org> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index b9d08f0e1e3..b758d49b811 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -161,8 +161,6 @@ int __init arch_early_irq_init(void) node= cpu_to_node(boot_cpu_id); for (i = 0; i < count; i++) { - if (i < legacy_pic->nr_legacy_irqs) - cfg[i].vector = IRQ0_VECTOR + i; desc = irq_to_desc(i); desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); -- cgit v1.2.3-70-g09d2 From e808bae2407a087bfd40200a27587898e5a9909d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Tue, 9 Feb 2010 21:38:45 -0200 Subject: x86: Do not reserve brk for DMI if it's not going to be used This will save 64K bytes from memory when loading linux if DMI is disabled, which is good for embedded systems. Signed-off-by: Thadeu Lima de Souza Cascardo LKML-Reference: <1265758732-19320-1-git-send-email-cascardo@holoscopio.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3499b4fabc9..cb42109a55b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -121,7 +121,9 @@ unsigned long max_low_pfn_mapped; unsigned long max_pfn_mapped; +#ifdef CONFIG_DMI RESERVE_BRK(dmi_alloc, 65536); +#endif unsigned int boot_cpu_id __read_mostly; -- cgit v1.2.3-70-g09d2 From 0c54dd341fb701928b8e5dca91ced1870c55b05b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 25 Feb 2010 08:42:06 -0500 Subject: ftrace: Remove memory barriers from NMI code when not needed The code in stop_machine that modifies the kernel text has a bit of logic to handle the case of NMIs. stop_machine does not prevent NMIs from executing, and if an NMI were to trigger on another CPU as the modifying CPU is changing the NMI text, a GPF could result. To prevent the GPF, the NMI calls ftrace_nmi_enter() which may modify the code first, then any other NMIs will just change the text to the same content which will do no harm. The code that stop_machine called must wait for NMIs to finish while it changes each location in the kernel. That code may also change the text to what the NMI changed it to. The key is that the text will never change content while another CPU is executing it. To make the above work, the call to ftrace_nmi_enter() must also do a smp_mb() as well as atomic_inc(). But for applications like perf that require a high number of NMIs for profiling, this can have a dramatic effect on the system. Not only is it doing a full memory barrier on both nmi_enter() as well as nmi_exit() it is also modifying a global variable with an atomic operation. This kills performance on large SMP machines. Since the memory barriers are only needed when ftrace is in the process of modifying the text (which is seldom), this patch adds a "modifying_code" variable that gets set before stop machine is executed and cleared afterwards. The NMIs will check this variable and store it in a per CPU "save_modifying_code" variable that it will use to check if it needs to do the memory barriers and atomic dec on NMI exit. Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/kernel/ftrace.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 30968924543..605ef196fdd 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -30,14 +30,32 @@ #ifdef CONFIG_DYNAMIC_FTRACE +/* + * modifying_code is set to notify NMIs that they need to use + * memory barriers when entering or exiting. But we don't want + * to burden NMIs with unnecessary memory barriers when code + * modification is not being done (which is most of the time). + * + * A mutex is already held when ftrace_arch_code_modify_prepare + * and post_process are called. No locks need to be taken here. + * + * Stop machine will make sure currently running NMIs are done + * and new NMIs will see the updated variable before we need + * to worry about NMIs doing memory barriers. + */ +static int modifying_code __read_mostly; +static DEFINE_PER_CPU(int, save_modifying_code); + int ftrace_arch_code_modify_prepare(void) { set_kernel_text_rw(); + modifying_code = 1; return 0; } int ftrace_arch_code_modify_post_process(void) { + modifying_code = 0; set_kernel_text_ro(); return 0; } @@ -149,6 +167,11 @@ static void ftrace_mod_code(void) void ftrace_nmi_enter(void) { + __get_cpu_var(save_modifying_code) = modifying_code; + + if (!__get_cpu_var(save_modifying_code)) + return; + if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { smp_rmb(); ftrace_mod_code(); @@ -160,6 +183,9 @@ void ftrace_nmi_enter(void) void ftrace_nmi_exit(void) { + if (!__get_cpu_var(save_modifying_code)) + return; + /* Finish all executions before clearing nmi_running */ smp_mb(); atomic_dec(&nmi_running); -- cgit v1.2.3-70-g09d2 From d498f763950703c724c650db1d34a1c8679f9ca8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:33:49 -0500 Subject: kprobes/x86: Cleanup RELATIVEJUMP_INSTRUCTION to RELATIVEJUMP_OPCODE Change RELATIVEJUMP_INSTRUCTION macro to RELATIVEJUMP_OPCODE since it represents just the opcode byte. Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133349.6725.99302.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kprobes.h | 2 +- arch/x86/kernel/kprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4fe681de1e7..eaec8ea7bf1 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -32,7 +32,7 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 +#define RELATIVEJUMP_OPCODE 0xe9 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 5de9f4a9c3f..15177cdfdd0 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -115,7 +115,7 @@ static void __kprobes set_jmp_op(void *from, void *to) } __attribute__((packed)) * jop; jop = (struct __arch_jmp_op *)from; jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + jop->op = RELATIVEJUMP_OPCODE; } /* -- cgit v1.2.3-70-g09d2 From 0f94eb634ef7af736dee5639aac1c2fe9635d089 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:23 -0500 Subject: kprobes/x86: Boost probes when reentering Integrate prepare_singlestep() into setup_singlestep() to boost up reenter probes, if possible. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133423.6725.12071.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 48 +++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 15177cdfdd0..c69bb65006f 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -406,18 +406,6 @@ static void __kprobes restore_btf(void) update_debugctlmsr(current->thread.debugctlmsr); } -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; -} - void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -430,19 +418,38 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, } static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -456,11 +463,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs, switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: /* A probe has been hit in the codepath leading up to, or just @@ -535,13 +539,13 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ -- cgit v1.2.3-70-g09d2 From f007ea2685692bafb386820144cf73a14016fc7c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:30 -0500 Subject: kprobes/x86: Cleanup save/restore registers Introduce SAVE/RESOTRE_REGS_STRING for cleanup kretprobe-trampoline asm code. These macros will be used for emulating interruption. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133430.6725.83342.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 128 ++++++++++++++++++++++++---------------------- 1 file changed, 67 insertions(+), 61 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index c69bb65006f..4ae95befd0e 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -554,6 +554,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %ds\n" \ + " pushl %es\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -567,65 +630,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void) /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. - * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -633,15 +647,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void) " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); -- cgit v1.2.3-70-g09d2 From 3d55cc8a058ee96291d6d45b1e35121b9920eca3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:38 -0500 Subject: x86: Add text_poke_smp for SMP cross modifying code Add generic text_poke_smp for SMP which uses stop_machine() to synchronize modifying code. This stop_machine() method is officially described at "7.1.3 Handling Self- and Cross-Modifying Code" on the intel's software developer's manual 3A. Since stop_machine() can't protect code against NMI/MCE, this function can not modify those handlers. And also, this function is basically for modifying multibyte-single-instruction. For modifying multibyte-multi-instructions, we need another special trap & detour code. This code originaly comes from immediate values with stop_machine() version. Thanks Jason and Mathieu! Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Mathieu Desnoyers Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133438.6725.80273.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 4 ++- arch/x86/kernel/alternative.c | 60 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ac80b7d7001..643d6ab3588 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -160,10 +160,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * More care must be taken when modifying code in the SMP case because of - * Intel's errata. + * Intel's errata. text_poke_smp() takes care that errata, but still + * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_smp(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index e63b80e5861..c41f13c15e8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -570,3 +571,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) local_irq_restore(flags); return addr; } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. + */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { + void *addr; + const void *opcode; + size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ + struct text_poke_params *tpp = data; + + if (atomic_dec_and_test(&stop_machine_first)) { + text_poke(tpp->addr, tpp->opcode, tpp->len); + smp_wmb(); /* Make sure other cpus see that this has run */ + wrote_text = 1; + } else { + while (!wrote_text) + smp_rmb(); + sync_core(); + } + + flush_icache_range((unsigned long)tpp->addr, + (unsigned long)tpp->addr + tpp->len); + return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ + struct text_poke_params tpp; + + tpp.addr = addr; + tpp.opcode = opcode; + tpp.len = len; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); + return addr; +} + -- cgit v1.2.3-70-g09d2 From c0f7ac3a9edde786bc129d37627953a8b8abefdf Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:46 -0500 Subject: kprobes/x86: Support kprobes jump optimization on x86 Introduce x86 arch-specific optimization code, which supports both of x86-32 and x86-64. This code also supports safety checking, which decodes whole of a function in which probe is inserted, and checks following conditions before optimization: - The optimized instructions which will be replaced by a jump instruction don't straddle the function boundary. - There is no indirect jump instruction, because it will jumps into the address range which is replaced by jump operand. - There is no jump/loop instruction which jumps into the address range which is replaced by jump operand. - Don't optimize kprobes if it is in functions into which fixup code will jumps. This uses text_poke_multibyte() which doesn't support modifying code on NMI/MCE handler. However, since kprobes itself doesn't support NMI/MCE code probing, it's not a problem. Changes in v9: - Use *_text_reserved() for checking the probe can be optimized. - Verify jump address range is in 2G range when preparing slot. - Backup original code when switching optimized buffer, instead of preparing buffer, because there can be int3 of other probes in preparing phase. - Check kprobe is disabled in arch_check_optimized_kprobe(). - Strictly check indirect jump opcodes (ff /4, ff /5). Changes in v6: - Split stop_machine-based jump patching code. - Update comments and coding style. Changes in v5: - Introduce stop_machine-based jump replacing. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133446.6725.78994.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 29 +++ arch/x86/kernel/kprobes.c | 433 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 441 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbcbfdee3ee..e6f5a98d515 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -31,6 +31,7 @@ config X86 select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index eaec8ea7bf1..4ffa345a8cc 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -33,6 +33,9 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc #define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4 #define MAX_INSN_SIZE 16 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE \ + (((unsigned long)&optprobe_template_end - \ + (unsigned long)&optprobe_template_entry) + \ + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn { int boostable; }; +struct arch_optimized_insn { + /* copy of the original instructions */ + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + /* detour code buffer */ + kprobe_opcode_t *insn; + /* the size of instructions copied to detour code buffer */ + size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. */ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->size; +} + struct prev_kprobe { struct kprobe *kp; unsigned long status; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 4ae95befd0e..b43bbaebe2c 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -106,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = { }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_OPCODE; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* @@ -202,7 +209,7 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr) /* * Basically, kp->ainsn.insn has an original instruction. * However, RIP-relative instruction can not do single-stepping - * at different place, fix_riprel() tweaks the displacement of + * at different place, __copy_instruction() tweaks the displacement of * that instruction. In that case, we can't recover the instruction * from the kp->ainsn.insn. * @@ -284,21 +291,37 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover) { -#ifdef CONFIG_X86_64 struct insn insn; - kernel_insn_init(&insn, p->ainsn.insn); + int ret; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + kernel_insn_init(&insn, src); + if (recover) { + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, + (unsigned long)src); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + } + insn_get_length(&insn); + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { s64 newdisp; u8 *disp; + kernel_insn_init(&insn, dest); insn_get_displacement(&insn); /* * The copied instruction uses the %rip-relative addressing @@ -312,20 +335,23 @@ static void __kprobes fix_riprel(struct kprobe *p) * extension of the original signed 32-bit displacement would * have given. */ - newdisp = (u8 *) p->addr + (s64) insn.displacement.value - - (u8 *) p->ainsn.insn; + newdisp = (u8 *) src + (s64) insn.displacement.value - + (u8 *) dest; BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ - disp = (u8 *) p->ainsn.insn + insn_offset_displacement(&insn); + disp = (u8 *) dest + insn_offset_displacement(&insn); *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* + * Copy an instruction without recovering int3, because it will be + * put by another subsystem. + */ + __copy_instruction(p->ainsn.insn, p->addr, 0); if (can_boost(p->addr)) p->ainsn.boostable = 0; @@ -417,9 +443,20 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) { + if (setup_detour_execution(p, regs, reenter)) + return; + #if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ @@ -815,8 +852,8 @@ static void __kprobes resume_execution(struct kprobe *p, * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1043,6 +1080,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +void __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + preempt_disable(); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + preempt_enable_no_resched(); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len, 1); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + int ret; + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + /* Dummy buffers for lookup_symbol_attrs */ + static char __dummy_buf[KSYM_NAME_LEN]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)addr); + insn_get_opcode(&insn); + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) { + ret = recover_probed_instruction(buf, addr); + if (ret) + return 0; + kernel_insn_init(&insn, buf); + } + insn_get_length(&insn); + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; -- cgit v1.2.3-70-g09d2 From d5d0e88c1e5b069aadb050ff6ec95df312de876a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Feb 2010 05:42:04 -0800 Subject: x86, olpc: Use pci subarch init for OLPC Replace the #ifdef'ed OLPC-specific init functions by a conditional x86_init function. If the function returns 0 we leave pci_arch_init, otherwise we continue. Signed-off-by: Thomas Gleixner Cc: Jesse Barnes Cc: Andres Salomon LKML-Reference: <43F901BD926A4E43B106BF17856F0755A318CE89@orsmsx508.amr.corp.intel.com> Signed-off-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/olpc.h | 20 ++------------------ arch/x86/include/asm/pci_x86.h | 1 - arch/x86/kernel/olpc.c | 10 +++++++--- arch/x86/pci/init.c | 8 ++++---- arch/x86/pci/olpc.c | 3 --- 5 files changed, 13 insertions(+), 29 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h index 3a57385d9fa..101229b0d8e 100644 --- a/arch/x86/include/asm/olpc.h +++ b/arch/x86/include/asm/olpc.h @@ -13,7 +13,6 @@ struct olpc_platform_t { #define OLPC_F_PRESENT 0x01 #define OLPC_F_DCON 0x02 -#define OLPC_F_VSA 0x04 #ifdef CONFIG_OLPC @@ -50,18 +49,6 @@ static inline int olpc_has_dcon(void) return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0; } -/* - * The VSA is software from AMD that typical Geode bioses will include. - * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does - * not include the VSA; instead, PCI is emulated by the kernel. - * - * The VSA is described further in arch/x86/pci/olpc.c. - */ -static inline int olpc_has_vsa(void) -{ - return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0; -} - /* * The "Mass Production" version of OLPC's XO is identified as being model * C2. During the prototype phase, the following models (in chronological @@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void) return 0; } -static inline int olpc_has_vsa(void) -{ - return 0; -} - #endif +extern int pci_olpc_init(void); + /* EC related functions */ extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 6e69edfbf07..36085518bad 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -104,7 +104,6 @@ extern bool port_cf9_safe; extern int pci_direct_probe(void); extern void pci_direct_init(int type); extern void pci_pcbios_init(void); -extern int pci_olpc_init(void); extern void __init dmi_check_pciprobe(void); extern void __init dmi_check_skip_isa_align(void); diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 9d1d263f786..8297160c41b 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -17,7 +17,9 @@ #include #include #include + #include +#include #include #ifdef CONFIG_OPEN_FIRMWARE @@ -243,9 +245,11 @@ static int __init olpc_init(void) olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, (unsigned char *) &olpc_platform_info.ecver, 1); - /* check to see if the VSA exists */ - if (cs5535_has_vsa2()) - olpc_platform_info.flags |= OLPC_F_VSA; +#ifdef CONFIG_PCI_OLPC + /* If the VSA exists let it emulate PCI, if not emulate in kernel */ + if (!cs5535_has_vsa2()) + x86_init.pci.arch_init = pci_olpc_init; +#endif printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c index 25a1f8efed4..adb62aaa7ec 100644 --- a/arch/x86/pci/init.c +++ b/arch/x86/pci/init.c @@ -1,6 +1,7 @@ #include #include #include +#include /* arch_initcall has too random ordering, so call the initializers in the right sequence from here. */ @@ -15,10 +16,9 @@ static __init int pci_arch_init(void) if (!(pci_probe & PCI_PROBE_NOEARLY)) pci_mmcfg_early_init(); -#ifdef CONFIG_PCI_OLPC - if (!pci_olpc_init()) - return 0; /* skip additional checks if it's an XO */ -#endif + if (x86_init.pci.arch_init && !x86_init.pci.arch_init()) + return 0; + #ifdef CONFIG_PCI_BIOS pci_pcbios_init(); #endif diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index b889d824f7c..b34815408f5 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = { int __init pci_olpc_init(void) { - if (!machine_is_olpc() || olpc_has_vsa()) - return -ENODEV; - printk(KERN_INFO "PCI: Using configuration type OLPC\n"); raw_pci_ops = &pci_olpc_conf; is_lx = is_geode_lx(); -- cgit v1.2.3-70-g09d2 From fb90ef93df654f2678933efbbf864adac0ae490e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Feb 2010 18:36:53 -0800 Subject: early_res: Add free_early_partial() To free partial areas in pcpu_setup... Reported-by: Peter Zijlstra Signed-off-by: Yinghai Lu Cc: Tejun Heo Cc: Christoph Lameter Cc: Stephen Rothwell Cc: Linus Torvalds Cc: Jesse Barnes Cc: Pekka Enberg LKML-Reference: <4B85E245.5030001@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 6 +++++ include/linux/early_res.h | 1 + kernel/early_res.c | 55 ++++++++++++++++++++++++++++++++++++++++++ mm/percpu.c | 3 --- 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 35abcb8b00e..ef6370b00e7 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -137,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) static void __init pcpu_fc_free(void *ptr, size_t size) { +#ifdef CONFIG_NO_BOOTMEM + u64 start = __pa(ptr); + u64 end = start + size; + free_early_partial(start, end); +#else free_bootmem(__pa(ptr), size); +#endif } static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) diff --git a/include/linux/early_res.h b/include/linux/early_res.h index 50f7663bb8b..29c09f57a13 100644 --- a/include/linux/early_res.h +++ b/include/linux/early_res.h @@ -5,6 +5,7 @@ extern void reserve_early(u64 start, u64 end, char *name); extern void reserve_early_overlap_ok(u64 start, u64 end, char *name); extern void free_early(u64 start, u64 end); +void free_early_partial(u64 start, u64 end); extern void early_res_to_bootmem(u64 start, u64 end); void reserve_early_without_check(u64 start, u64 end, char *name); diff --git a/kernel/early_res.c b/kernel/early_res.c index aa5494ac446..9ab11cd8485 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -61,6 +61,40 @@ static void __init drop_range(int i) early_res_count--; } +static void __init drop_range_partial(int i, u64 start, u64 end) +{ + u64 common_start, common_end; + u64 old_start, old_end; + + old_start = early_res[i].start; + old_end = early_res[i].end; + common_start = max(old_start, start); + common_end = min(old_end, end); + + /* no overlap ? */ + if (common_start >= common_end) + return; + + if (old_start < common_start) { + /* make head segment */ + early_res[i].end = common_start; + if (old_end > common_end) { + /* add another for left over on tail */ + reserve_early_without_check(common_end, old_end, + early_res[i].name); + } + return; + } else { + if (old_end > common_end) { + /* reuse the entry for tail left */ + early_res[i].start = common_end; + return; + } + /* all covered */ + drop_range(i); + } +} + /* * Split any existing ranges that: * 1) are marked 'overlap_ok', and @@ -284,6 +318,27 @@ void __init free_early(u64 start, u64 end) drop_range(i); } +void __init free_early_partial(u64 start, u64 end) +{ + struct early_res *r; + int i; + +try_next: + i = find_overlapped_early(start, end); + if (i >= max_early_res) + return; + + r = &early_res[i]; + /* hole ? */ + if (r->end >= end && r->start <= start) { + drop_range_partial(i, start, end); + return; + } + + drop_range_partial(i, start, end); + goto try_next; +} + #ifdef CONFIG_NO_BOOTMEM static void __init subtract_early_res(struct range *range, int az) { diff --git a/mm/percpu.c b/mm/percpu.c index 841defeeef8..083e7c91e5f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1929,10 +1929,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); -#ifndef CONFIG_NO_BOOTMEM - /* fix partial free ! */ free_fn(ptr + size_sum, ai->unit_size - size_sum); -#endif } } -- cgit v1.2.3-70-g09d2 From 78c06176466cbd1b3f0f67709d3023c40dbebcbd Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 26 Feb 2010 10:49:12 -0600 Subject: x86: Enable NMI on all cpus on UV Enable NMI on all cpus in UV system and add an NMI handler to dump_stack on each cpu. By default on x86 all the cpus except the boot cpu have NMI masked off. This patch enables NMI on all cpus in UV system and adds an NMI handler to dump_stack on each cpu. This way if a system hangs we can NMI the machine and get a backtrace from all the cpus. Version 2: Use x86_platform driver mechanism for nmi init, per Ingo's suggestion. Version 3: Clean up Ingo's nits. Signed-off-by: Russ Anderson LKML-Reference: <20100226164912.GA24439@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv.h | 1 + arch/x86/include/asm/x86_init.h | 2 ++ arch/x86/kernel/apic/x2apic_uv_x.c | 44 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 3 +++ 5 files changed, 51 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index c0a01b5d985..3bb9491b765 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -11,6 +11,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); extern void uv_cpu_init(void); +extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index ea0e8ea15e1..60cc3526908 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -126,6 +126,7 @@ struct x86_cpuinit_ops { * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock * @is_untracked_pat_range exclude from PAT logic + * @nmi_init enable NMI on cpus */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -133,6 +134,7 @@ struct x86_platform_ops { int (*set_wallclock)(unsigned long nowtime); void (*iommu_shutdown)(void); bool (*is_untracked_pat_range)(u64 start, u64 end); + void (*nmi_init)(void); }; extern struct x86_init_ops x86_init; diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 6ef2899eb86..4b8dbb25614 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ static enum uv_system_type uv_system_type; static u64 gru_start_paddr, gru_end_paddr; int uv_min_hub_revision_id; EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +static DEFINE_SPINLOCK(uv_nmi_lock); static inline bool is_GRU_range(u64 start, u64 end) { @@ -74,6 +76,7 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strcmp(oem_id, "SGI")) { nodeid = early_get_nodeid(); x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) @@ -596,6 +599,46 @@ void __cpuinit uv_cpu_init(void) set_x2apic_extra_bits(uv_hub_info->pnode); } +/* + * When NMI is received, print a stack trace. + */ +int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +{ + if (reason != DIE_NMI_IPI) + return NOTIFY_OK; + /* + * Use a lock so only one cpu prints at a time + * to prevent intermixed output. + */ + spin_lock(&uv_nmi_lock); + pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); + dump_stack(); + spin_unlock(&uv_nmi_lock); + + return NOTIFY_STOP; +} + +static struct notifier_block uv_dump_stack_nmi_nb = { + .notifier_call = uv_handle_nmi +}; + +void uv_register_nmi_notifier(void) +{ + if (register_die_notifier(&uv_dump_stack_nmi_nb)) + printk(KERN_WARNING "UV NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} void __init uv_system_init(void) { @@ -717,6 +760,7 @@ void __init uv_system_init(void) uv_cpu_init(); uv_scir_register_cpu_notifier(); + uv_register_nmi_notifier(); proc_mkdir("sgi_uv", NULL); /* register Legacy VGA I/O redirection handler */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 678d0b8c26f..838a118876c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -320,6 +320,7 @@ notrace static void __cpuinit start_secondary(void *unused) unlock_vector_lock(); ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); /* enable local interrupts */ local_irq_enable(); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index ccd179dec36..ee5746c9462 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -76,10 +76,13 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, }; +static void default_nmi_init(void) { }; + struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init }; -- cgit v1.2.3-70-g09d2 From 21c2fd9970cc8e457058f84016450a2e9876125e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 26 Feb 2010 11:17:16 +0100 Subject: x86: apic: Fix mismerge, add arch_probe_nr_irqs() again Merge commit aef55d4922 mis-merged io_apic.c so we lost the arch_probe_nr_irqs() method. This caused subtle boot breakages (udev confusion likely due to missing drivers) with certain configs. Cc: H. Peter Anvin Cc: Yinghai Lu LKML-Reference: <20100207210250.GB8256@jenkins.home.ifup.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 979589881c8..72ac2a33299 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -3876,6 +3876,28 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ + int nr; + + if (nr_irqs > (NR_VECTORS * nr_cpu_ids)) + nr_irqs = NR_VECTORS * nr_cpu_ids; + + nr = nr_irqs_gsi + 8 * nr_cpu_ids; +#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ) + /* + * for MSI and HT dyn irq + */ + nr += nr_irqs_gsi * 16; +#endif + if (nr < nr_irqs) + nr_irqs = nr; + + return 0; +} +#endif + static int __io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { -- cgit v1.2.3-70-g09d2 From 3249b7e1df6380e9d7bb3238f64f445bf614f787 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:01 +0000 Subject: x86, vmi: Disable highmem PTE allocation even when CONFIG_HIGHPTE=y Preventing HIGHPTE allocations under VMI will allow us to remove the kmap_atomic_pte paravirt op. Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-2-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmi_32.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index d430e4c3019..58aca86193e 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -272,19 +273,11 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) void *va = kmap_atomic(page, type); /* - * Internally, the VMI ROM must map virtual addresses to physical - * addresses for processing MMU updates. By the time MMU updates - * are issued, this information is typically already lost. - * Fortunately, the VMI provides a cache of mapping slots for active - * page tables. - * - * We use slot zero for the linear mapping of physical memory, and - * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1. - * - * args: SLOT VA COUNT PFN + * We disable highmem allocations for page tables so we should never + * see any calls to kmap_atomic_pte on a highmem page. */ - BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); + + BUG_ON(PageHighmem(page)); return va; } @@ -640,6 +633,12 @@ static inline int __init activate_vmi(void) u64 reloc; const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + /* + * Prevent page tables from being allocated in highmem, even if + * CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + if (call_vrom_func(vmi_rom, vmi_init) != 0) { printk(KERN_ERR "VMI ROM failed to initialize!"); return 0; -- cgit v1.2.3-70-g09d2 From dad52fc01161afcb8798c609e009aed4d104927f Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 26 Feb 2010 17:16:02 +0000 Subject: x86, paravirt: Remove kmap_atomic_pte paravirt op. Now that both Xen and VMI disable allocations of PTE pages from high memory this paravirt op serves no further purpose. This effectively reverts ce6234b5 "add kmap_atomic_pte for mapping highpte pages". Signed-off-by: Ian Campbell LKML-Reference: <1267204562-11844-3-git-send-email-ian.campbell@citrix.com> Acked-by: Alok Kataria Cc: Jeremy Fitzhardinge Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/highmem.h | 4 ---- arch/x86/include/asm/paravirt.h | 9 --------- arch/x86/include/asm/paravirt_types.h | 4 ---- arch/x86/include/asm/pgtable_32.h | 4 ++-- arch/x86/kernel/paravirt.c | 4 ---- arch/x86/kernel/vmi_32.c | 20 -------------------- arch/x86/xen/mmu.c | 22 ---------------------- 7 files changed, 2 insertions(+), 65 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 014c2b85ae4..a726650fc80 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -#ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) -#endif - #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dd59a85a918..5653f43d90e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } -#ifdef CONFIG_HIGHPTE -static inline void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - unsigned long ret; - ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); - return (void *)ret; -} -#endif - static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b1e70d51e40..db9ef553234 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -304,10 +304,6 @@ struct pv_mmu_ops { #endif /* PAGETABLE_LEVELS == 4 */ #endif /* PAGETABLE_LEVELS >= 3 */ -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - struct pv_lazy_ops lazy_mode; /* dom0 ops */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 01fd9461d32..b422d2201af 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -54,10 +54,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t); in_irq() ? KM_IRQ_PTE : \ KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1b1739d1631..1db183ed7c0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = { .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, -#endif - #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 58aca86193e..7dd599deca4 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -267,22 +267,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_HIGHPTE -static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) -{ - void *va = kmap_atomic(page, type); - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. - */ - - BUG_ON(PageHighmem(page)); - - return va; -} -#endif - static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -777,10 +761,6 @@ static inline int __init activate_vmi(void) /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); -#ifdef CONFIG_HIGHPTE - if (vmi_ops.set_linear_mapping) - pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; -#endif /* * These MUST always be patched. Don't support indirect jumps diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 350a3deedf2..f9eb7de74f4 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1427,24 +1427,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - /* - * We disable highmem allocations for page tables so we should never - * see any calls to kmap_atomic_pte on a highmem page. - */ - BUG_ON(PageHighMem(page)); - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - return kmap_atomic_prot(page, type, prot); -} -#endif - #ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { @@ -1903,10 +1885,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - #ifdef CONFIG_X86_64 .set_pte = xen_set_pte, #else -- cgit v1.2.3-70-g09d2 From fad539956c9e69749a03f7817d22d1bab87657bf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 28 Feb 2010 01:06:34 -0800 Subject: x86: Fix out of order of gsi Iranna D Ankad reported that IBM x3950 systems have boot problems after this commit: | | commit b9c61b70075c87a8612624736faf4a2de5b1ed30 | | x86/pci: update pirq_enable_irq() to setup io apic routing | The problem is that with the patch, the machine freezes when console=ttyS0,... kernel serial parameter is passed. It seem to freeze at DVD initialization and the whole problem seem to be DVD/pata related, but somehow exposed through the serial parameter. Such apic problems can expose really weird behavior: ACPI: IOAPIC (id[0x10] address[0xfecff000] gsi_base[0]) IOAPIC[0]: apic_id 16, version 0, address 0xfecff000, GSI 0-2 ACPI: IOAPIC (id[0x0f] address[0xfec00000] gsi_base[3]) IOAPIC[1]: apic_id 15, version 0, address 0xfec00000, GSI 3-38 ACPI: IOAPIC (id[0x0e] address[0xfec01000] gsi_base[39]) IOAPIC[2]: apic_id 14, version 0, address 0xfec01000, GSI 39-74 ACPI: INT_SRC_OVR (bus 0 bus_irq 1 global_irq 4 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 0 global_irq 5 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 3 global_irq 6 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 4 global_irq 7 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 6 global_irq 9 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 7 global_irq 10 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 8 global_irq 11 low edge) ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 12 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 12 global_irq 15 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 13 global_irq 16 dfl dfl) ACPI: INT_SRC_OVR (bus 0 bus_irq 14 global_irq 17 low edge) ACPI: INT_SRC_OVR (bus 0 bus_irq 15 global_irq 18 dfl dfl) It turns out that the system has three io apic controllers, but boot ioapic routing is in the second one, and that gsi_base is not 0 - it is using a bunch of INT_SRC_OVR... So these recent changes: 1. one set routing for first io apic controller 2. assume irq = gsi ... will break that system. So try to remap those gsis, need to seperate boot_ioapic_idx detection out of enable_IO_APIC() and call them early. So introduce boot_ioapic_idx, and remap_ioapic_gsi()... -v2: shift gsi with delta instead of gsi_base of boot_ioapic_idx -v3: double check with find_isa_irq_apic(0, mp_INT) to get right boot_ioapic_idx -v4: nr_legacy_irqs -v5: add print out for boot_ioapic_idx, and also make it could be applied for current kernel and previous kernel -v6: add bus_irq, in acpi_sci_ioapic_setup, so can get overwride for sci right mapping... -v7: looks like pnpacpi get irq instead of gsi, so need to revert them back... -v8: split into two patches -v9: according to Eric, use fixed 16 for shifting instead of remap -v10: still need to touch rsparser.c -v11: just revert back to way Eric suggest... anyway the ioapic in first ioapic is blocked by second... -v12: two patches, this one will add more loop but check apic_id and irq > 16 Reported-by: Iranna D Ankad Bisected-by: Iranna D Ankad Tested-by: Gary Hade Signed-off-by: Yinghai Lu Cc: Eric W. Biederman Cc: Thomas Renninger Cc: Eric W. Biederman Cc: Suresh Siddha Cc: len.brown@intel.com LKML-Reference: <4B8A321A.1000008@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/io_apic.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 72ac2a33299..97e1e3ec2ed 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1475,7 +1475,7 @@ static struct { static void __init setup_IO_APIC_irqs(void) { - int apic_id = 0, pin, idx, irq; + int apic_id, pin, idx, irq; int notcon = 0; struct irq_desc *desc; struct irq_cfg *cfg; @@ -1483,14 +1483,7 @@ static void __init setup_IO_APIC_irqs(void) apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - apic_id = mp_find_ioapic(0); - if (apic_id < 0) - apic_id = 0; - } -#endif - + for (apic_id = 0; apic_id < nr_ioapics; apic_id++) for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (idx == -1) { @@ -1512,6 +1505,9 @@ static void __init setup_IO_APIC_irqs(void) irq = pin_2_irq(idx, apic_id, pin); + if ((apic_id > 0) && (irq > 16)) + continue; + /* * Skip the timer IRQ if there's a quirk handler * installed and if it returns 1: @@ -4105,27 +4101,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #ifdef CONFIG_SMP void __init setup_ioapic_dest(void) { - int pin, ioapic = 0, irq, irq_entry; + int pin, ioapic, irq, irq_entry; struct irq_desc *desc; const struct cpumask *mask; if (skip_ioapic_setup == 1) return; -#ifdef CONFIG_ACPI - if (!acpi_disabled && acpi_ioapic) { - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - ioapic = 0; - } -#endif - + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); + if ((ioapic > 0) && (irq > 16)) + continue; + desc = irq_to_desc(irq); /* -- cgit v1.2.3-70-g09d2 From be43f83dada2cf0e9e01c9a0ba42977c5bd70f9d Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Fri, 18 Dec 2009 16:48:45 +0800 Subject: x86: Raise vsyscall priority on hotplug notifier chain KVM need vsyscall_init() to initialize MSR_TSC_AUX before it read the value. Per Avi's suggestion, this patch raised vsyscall priority on hotplug notifier chain, to 30. CC: Ingo Molnar CC: linux-kernel@vger.kernel.org Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kernel/vsyscall_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 9055e5872ff..1c0c6ab9c60 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,8 @@ static int __init vsyscall_init(void) register_sysctl_table(kernel_root_table2); #endif on_each_cpu(cpu_vsyscall_init, NULL, 1); - hotcpu_notifier(cpu_vsyscall_notifier, 0); + /* notifier priority > KVM */ + hotcpu_notifier(cpu_vsyscall_notifier, 30); return 0; } -- cgit v1.2.3-70-g09d2 From 3010673ef5f7bef4b4685566a0713de1b4306c93 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Tue, 2 Mar 2010 21:01:34 -0800 Subject: x86, mrst: Fix APB timer per cpu clockevent The current APB timer code incorrectly registers a static copy of the clockevent device for the boot CPU. The per cpu clockevent should be used instead. This bug was hidden by zero-initialized data; as such it did not get exposed in testing, but was discovered by code review. Signed-off-by: Jacob Pan LKML-Reference: <1267592494-7723-1-git-send-email-jacob.jun.pan@linux.intel.com> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apb_timer.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 83a345b0256..6f27f8b7579 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -84,9 +84,10 @@ struct apbt_dev { int disable_apbt_percpu __cpuinitdata; +static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); + #ifdef CONFIG_SMP static unsigned int apbt_num_timers_used; -static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); static struct apbt_dev *apbt_devs; #endif @@ -302,6 +303,7 @@ static void apbt_disable_int(int n) static int __init apbt_clockevent_register(void) { struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); if (mtmr == NULL) { @@ -329,22 +331,24 @@ static int __init apbt_clockevent_register(void) * global if not used for per cpu timer. */ apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); if (disable_apbt_percpu) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; - global_clock_event = &apbt_clockevent; + global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", global_clock_event->name); } if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, &apbt_clockevent)) { + apbt_clockevent.name, adev)) { printk(KERN_ERR "Failed request IRQ for APBT%d\n", apbt_clockevent.irq); } - clockevents_register_device(&apbt_clockevent); + clockevents_register_device(&adev->evt); /* Start APBT 0 interrupts */ apbt_enable_int(APBT_CLOCKEVENT0_NUM); -- cgit v1.2.3-70-g09d2 From c7bbf52aa4fa332b84c4f2bb33e69561ee6870b4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 3 Mar 2010 13:38:48 -0800 Subject: x86, mrst: Fix whitespace breakage in apb_timer.c Checkin bb24c4716185f6e116c440462c65c1f56649183b: "Moorestown APB system timer driver" suffered from severe whitespace damage in arch/x86/kernel/apb_timer.c due to using Microsoft Lookout to send a patch. Fix the whitespace breakage. Reported-by: Jacob Pan Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apb_timer.c | 1068 +++++++++++++++++++++---------------------- 1 file changed, 534 insertions(+), 534 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 6f27f8b7579..2afa27d0129 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,11 +43,11 @@ #include #include -#define APBT_MASK CLOCKSOURCE_MASK(32) -#define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 -#define APBT_CLOCKSOURCE_RATING 250 -#define APBT_MIN_DELTA_USEC 200 +#define APBT_MASK CLOCKSOURCE_MASK(32) +#define APBT_SHIFT 22 +#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKSOURCE_RATING 250 +#define APBT_MIN_DELTA_USEC 200 #define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt) #define APBT_CLOCKEVENT0_NUM (0) @@ -65,21 +65,21 @@ static int phy_cs_timer_id; static uint64_t apbt_freq; static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt); + struct clock_event_device *evt); static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt); + struct clock_event_device *evt); static cycle_t apbt_read_clocksource(struct clocksource *cs); static void apbt_restart_clocksource(void); struct apbt_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int tick; - unsigned int count; - unsigned int flags; - char name[10]; + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int tick; + unsigned int count; + unsigned int flags; + char name[10]; }; int disable_apbt_percpu __cpuinitdata; @@ -91,77 +91,77 @@ static unsigned int apbt_num_timers_used; static struct apbt_dev *apbt_devs; #endif -static inline unsigned long apbt_readl_reg(unsigned long a) +static inline unsigned long apbt_readl_reg(unsigned long a) { - return readl(apbt_virt_address + a); + return readl(apbt_virt_address + a); } static inline void apbt_writel_reg(unsigned long d, unsigned long a) { - writel(d, apbt_virt_address + a); + writel(d, apbt_virt_address + a); } static inline unsigned long apbt_readl(int n, unsigned long a) { - return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); + return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE); } static inline void apbt_writel(int n, unsigned long d, unsigned long a) { - writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); + writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE); } static inline void apbt_set_mapping(void) { - struct sfi_timer_table_entry *mtmr; - - if (apbt_virt_address) { - pr_debug("APBT base already mapped\n"); - return; - } - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return; - } - apbt_address = (unsigned long)mtmr->phys_addr; - if (!apbt_address) { - printk(KERN_WARNING "No timer base from SFI, use default\n"); - apbt_address = APBT_DEFAULT_BASE; - } - apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); - if (apbt_virt_address) { - pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ - (void *)apbt_address, (void *)apbt_virt_address); - } else { - pr_debug("Failed mapping APBT phy address at %p\n",\ - (void *)apbt_address); - goto panic_noapbt; - } - apbt_freq = mtmr->freq_hz / USEC_PER_SEC; - sfi_free_mtmr(mtmr); - - /* Now figure out the physical timer id for clocksource device */ - mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); - if (mtmr == NULL) - goto panic_noapbt; - - /* Now figure out the physical timer id */ - phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) - / APBTMRS_REG_SIZE; - pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); - return; + struct sfi_timer_table_entry *mtmr; + + if (apbt_virt_address) { + pr_debug("APBT base already mapped\n"); + return; + } + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return; + } + apbt_address = (unsigned long)mtmr->phys_addr; + if (!apbt_address) { + printk(KERN_WARNING "No timer base from SFI, use default\n"); + apbt_address = APBT_DEFAULT_BASE; + } + apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE); + if (apbt_virt_address) { + pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\ + (void *)apbt_address, (void *)apbt_virt_address); + } else { + pr_debug("Failed mapping APBT phy address at %p\n",\ + (void *)apbt_address); + goto panic_noapbt; + } + apbt_freq = mtmr->freq_hz / USEC_PER_SEC; + sfi_free_mtmr(mtmr); + + /* Now figure out the physical timer id for clocksource device */ + mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM); + if (mtmr == NULL) + goto panic_noapbt; + + /* Now figure out the physical timer id */ + phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) + / APBTMRS_REG_SIZE; + pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id); + return; panic_noapbt: - panic("Failed to setup APB system timer\n"); + panic("Failed to setup APB system timer\n"); } static inline void apbt_clear_mapping(void) { - iounmap(apbt_virt_address); - apbt_virt_address = NULL; + iounmap(apbt_virt_address); + apbt_virt_address = NULL; } /* @@ -169,28 +169,28 @@ static inline void apbt_clear_mapping(void) */ static inline int is_apbt_capable(void) { - return apbt_virt_address ? 1 : 0; + return apbt_virt_address ? 1 : 0; } static struct clocksource clocksource_apbt = { - .name = "apbt", - .rating = APBT_CLOCKSOURCE_RATING, - .read = apbt_read_clocksource, - .mask = APBT_MASK, - .shift = APBT_SHIFT, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, - .resume = apbt_restart_clocksource, + .name = "apbt", + .rating = APBT_CLOCKSOURCE_RATING, + .read = apbt_read_clocksource, + .mask = APBT_MASK, + .shift = APBT_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = apbt_restart_clocksource, }; /* boot APB clock event device */ static struct clock_event_device apbt_clockevent = { - .name = "apbt0", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = apbt_set_mode, - .set_next_event = apbt_next_event, - .shift = APBT_SHIFT, - .irq = 0, - .rating = APBT_CLOCKEVENT_RATING, + .name = "apbt0", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = apbt_set_mode, + .set_next_event = apbt_next_event, + .shift = APBT_SHIFT, + .irq = 0, + .rating = APBT_CLOCKEVENT_RATING, }; /* @@ -199,20 +199,20 @@ static struct clock_event_device apbt_clockevent = { */ static inline int __init setup_x86_mrst_timer(char *arg) { - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + disable_apbt_percpu = 0; + else if (strcmp("lapic_and_apbt", arg) == 0) + disable_apbt_percpu = 1; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; } __setup("x86_mrst_timer=", setup_x86_mrst_timer); @@ -222,176 +222,176 @@ __setup("x86_mrst_timer=", setup_x86_mrst_timer); */ static void apbt_start_counter(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); - /* enable, mask interrupt */ - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - /* read it once to get cached counter value initialized */ - apbt_read_clocksource(&clocksource_apbt); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT); + /* enable, mask interrupt */ + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT); + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + /* read it once to get cached counter value initialized */ + apbt_read_clocksource(&clocksource_apbt); } static irqreturn_t apbt_interrupt_handler(int irq, void *data) { - struct apbt_dev *dev = (struct apbt_dev *)data; - struct clock_event_device *aevt = &dev->evt; - - if (!aevt->event_handler) { - printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", - dev->num); - return IRQ_NONE; - } - aevt->event_handler(aevt); - return IRQ_HANDLED; + struct apbt_dev *dev = (struct apbt_dev *)data; + struct clock_event_device *aevt = &dev->evt; + + if (!aevt->event_handler) { + printk(KERN_INFO "Spurious APBT timer interrupt on %d\n", + dev->num); + return IRQ_NONE; + } + aevt->event_handler(aevt); + return IRQ_HANDLED; } static void apbt_restart_clocksource(void) { - apbt_start_counter(phy_cs_timer_id); + apbt_start_counter(phy_cs_timer_id); } /* Setup IRQ routing via IOAPIC */ #ifdef CONFIG_SMP static void apbt_setup_irq(struct apbt_dev *adev) { - struct irq_chip *chip; - struct irq_desc *desc; - - /* timer0 irq has been setup early */ - if (adev->irq == 0) - return; - desc = irq_to_desc(adev->irq); - chip = get_irq_chip(adev->irq); - disable_irq(adev->irq); - desc->status |= IRQ_MOVE_PCNTXT; - irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); - /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ - set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); - enable_irq(adev->irq); - if (system_state == SYSTEM_BOOTING) - if (request_irq(adev->irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - adev->name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - adev->num); - } + struct irq_chip *chip; + struct irq_desc *desc; + + /* timer0 irq has been setup early */ + if (adev->irq == 0) + return; + desc = irq_to_desc(adev->irq); + chip = get_irq_chip(adev->irq); + disable_irq(adev->irq); + desc->status |= IRQ_MOVE_PCNTXT; + irq_set_affinity(adev->irq, cpumask_of(adev->cpu)); + /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */ + set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge"); + enable_irq(adev->irq); + if (system_state == SYSTEM_BOOTING) + if (request_irq(adev->irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + adev->name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + adev->num); + } } #endif static void apbt_enable_int(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - /* clear pending intr */ - apbt_readl(n, APBTMR_N_EOI); - ctrl &= ~APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + /* clear pending intr */ + apbt_readl(n, APBTMR_N_EOI); + ctrl &= ~APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); } static void apbt_disable_int(int n) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_INT; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_INT; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); } static int __init apbt_clockevent_register(void) { - struct sfi_timer_table_entry *mtmr; - struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); - - mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); - if (mtmr == NULL) { - printk(KERN_ERR "Failed to get MTMR %d from SFI\n", - APBT_CLOCKEVENT0_NUM); - return -ENODEV; - } - - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz - , NSEC_PER_SEC, APBT_SHIFT); - - /* Calculate the min / max delta */ - apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, - &apbt_clockevent); - apbt_clockevent.min_delta_ns = clockevent_delta2ns( - APBT_MIN_DELTA_USEC*apbt_freq, - &apbt_clockevent); - /* - * Start apbt with the boot cpu mask and make it - * global if not used for per cpu timer. - */ - apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); - adev->num = smp_processor_id(); - memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - - if (disable_apbt_percpu) { - apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; + struct sfi_timer_table_entry *mtmr; + struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev); + + mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM); + if (mtmr == NULL) { + printk(KERN_ERR "Failed to get MTMR %d from SFI\n", + APBT_CLOCKEVENT0_NUM); + return -ENODEV; + } + + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz + , NSEC_PER_SEC, APBT_SHIFT); + + /* Calculate the min / max delta */ + apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &apbt_clockevent); + apbt_clockevent.min_delta_ns = clockevent_delta2ns( + APBT_MIN_DELTA_USEC*apbt_freq, + &apbt_clockevent); + /* + * Start apbt with the boot cpu mask and make it + * global if not used for per cpu timer. + */ + apbt_clockevent.cpumask = cpumask_of(smp_processor_id()); + adev->num = smp_processor_id(); + memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); + + if (disable_apbt_percpu) { + apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; - printk(KERN_DEBUG "%s clockevent registered as global\n", - global_clock_event->name); - } - - if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, - IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, - apbt_clockevent.name, adev)) { - printk(KERN_ERR "Failed request IRQ for APBT%d\n", - apbt_clockevent.irq); - } - - clockevents_register_device(&adev->evt); - /* Start APBT 0 interrupts */ - apbt_enable_int(APBT_CLOCKEVENT0_NUM); - - sfi_free_mtmr(mtmr); - return 0; + printk(KERN_DEBUG "%s clockevent registered as global\n", + global_clock_event->name); + } + + if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler, + IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING, + apbt_clockevent.name, adev)) { + printk(KERN_ERR "Failed request IRQ for APBT%d\n", + apbt_clockevent.irq); + } + + clockevents_register_device(&adev->evt); + /* Start APBT 0 interrupts */ + apbt_enable_int(APBT_CLOCKEVENT0_NUM); + + sfi_free_mtmr(mtmr); + return 0; } #ifdef CONFIG_SMP /* Should be called with per cpu */ void apbt_setup_secondary_clock(void) { - struct apbt_dev *adev; - struct clock_event_device *aevt; - int cpu; - - /* Don't register boot CPU clockevent */ - cpu = smp_processor_id(); - if (cpu == boot_cpu_id) - return; - /* - * We need to calculate the scaled math multiplication factor for - * nanosecond to apbt tick conversion. - * mult = (nsec/cycle)*2^APBT_SHIFT - */ - printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); - adev = &per_cpu(cpu_apbt_dev, cpu); - aevt = &adev->evt; - - memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); - aevt->cpumask = cpumask_of(cpu); - aevt->name = adev->name; - aevt->mode = CLOCK_EVT_MODE_UNUSED; - - printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", - cpu, aevt->name, *(u32 *)aevt->cpumask); - - apbt_setup_irq(adev); - - clockevents_register_device(aevt); - - apbt_enable_int(cpu); - - return; + struct apbt_dev *adev; + struct clock_event_device *aevt; + int cpu; + + /* Don't register boot CPU clockevent */ + cpu = smp_processor_id(); + if (cpu == boot_cpu_id) + return; + /* + * We need to calculate the scaled math multiplication factor for + * nanosecond to apbt tick conversion. + * mult = (nsec/cycle)*2^APBT_SHIFT + */ + printk(KERN_INFO "Init per CPU clockevent %d\n", cpu); + adev = &per_cpu(cpu_apbt_dev, cpu); + aevt = &adev->evt; + + memcpy(aevt, &apbt_clockevent, sizeof(*aevt)); + aevt->cpumask = cpumask_of(cpu); + aevt->name = adev->name; + aevt->mode = CLOCK_EVT_MODE_UNUSED; + + printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n", + cpu, aevt->name, *(u32 *)aevt->cpumask); + + apbt_setup_irq(adev); + + clockevents_register_device(aevt); + + apbt_enable_int(cpu); + + return; } /* @@ -405,34 +405,34 @@ void apbt_setup_secondary_clock(void) * the extra interrupt is harmless. */ static int apbt_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) + unsigned long action, void *hcpu) { - unsigned long cpu = (unsigned long)hcpu; - struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - - switch (action & 0xf) { - case CPU_DEAD: - apbt_disable_int(cpu); - if (system_state == SYSTEM_RUNNING) - pr_debug("skipping APBT CPU %lu offline\n", cpu); - else if (adev) { - pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - free_irq(adev->irq, adev); - } - break; - default: - pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); - } - return NOTIFY_OK; + unsigned long cpu = (unsigned long)hcpu; + struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); + + switch (action & 0xf) { + case CPU_DEAD: + apbt_disable_int(cpu); + if (system_state == SYSTEM_RUNNING) + pr_debug("skipping APBT CPU %lu offline\n", cpu); + else if (adev) { + pr_debug("APBT clockevent for cpu %lu offline\n", cpu); + free_irq(adev->irq, adev); + } + break; + default: + pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); + } + return NOTIFY_OK; } static __init int apbt_late_init(void) { - if (disable_apbt_percpu) - return 0; - /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(apbt_cpuhp_notify, -20); - return 0; + if (disable_apbt_percpu) + return 0; + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(apbt_cpuhp_notify, -20); + return 0; } fs_initcall(apbt_late_init); #else @@ -442,93 +442,93 @@ void apbt_setup_secondary_clock(void) {} #endif /* CONFIG_SMP */ static void apbt_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long ctrl; - uint64_t delta; - int timer_num; - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - pr_debug("%s CPU %d timer %d mode=%d\n", - __func__, first_cpu(*evt->cpumask), timer_num, mode); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; - delta >>= apbt_clockevent.shift; - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl |= APBTMR_CONTROL_MODE_PERIODIC; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* - * DW APB p. 46, have to disable timer before load counter, - * may cause sync problem. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - udelay(1); - pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - /* APB timer does not have one-shot mode, use free running mode */ - case CLOCK_EVT_MODE_ONESHOT: - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - /* - * set free running mode, this mode will let timer reload max - * timeout which will give time (3min on 25MHz clock) to rearm - * the next event, therefore emulate the one-shot mode. - */ - ctrl &= ~APBTMR_CONTROL_ENABLE; - ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; - - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write again to set free running mode */ - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - - /* - * DW APB p. 46, load counter with all 1s before starting free - * running mode. - */ - apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); - ctrl &= ~APBTMR_CONTROL_INT; - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - apbt_disable_int(timer_num); - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - break; - - case CLOCK_EVT_MODE_RESUME: - apbt_enable_int(timer_num); - break; - } + unsigned long ctrl; + uint64_t delta; + int timer_num; + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + pr_debug("%s CPU %d timer %d mode=%d\n", + __func__, first_cpu(*evt->cpumask), timer_num, mode); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult; + delta >>= apbt_clockevent.shift; + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl |= APBTMR_CONTROL_MODE_PERIODIC; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* + * DW APB p. 46, have to disable timer before load counter, + * may cause sync problem. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + udelay(1); + pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ); + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + /* APB timer does not have one-shot mode, use free running mode */ + case CLOCK_EVT_MODE_ONESHOT: + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + /* + * set free running mode, this mode will let timer reload max + * timeout which will give time (3min on 25MHz clock) to rearm + * the next event, therefore emulate the one-shot mode. + */ + ctrl &= ~APBTMR_CONTROL_ENABLE; + ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC; + + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write again to set free running mode */ + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + + /* + * DW APB p. 46, load counter with all 1s before starting free + * running mode. + */ + apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT); + ctrl &= ~APBTMR_CONTROL_INT; + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + apbt_disable_int(timer_num); + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + break; + + case CLOCK_EVT_MODE_RESUME: + apbt_enable_int(timer_num); + break; + } } static int apbt_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long ctrl; - int timer_num; - - struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); - - timer_num = adev->num; - /* Disable timer */ - ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - /* write new count */ - apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); - ctrl |= APBTMR_CONTROL_ENABLE; - apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); - return 0; + unsigned long ctrl; + int timer_num; + + struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + + timer_num = adev->num; + /* Disable timer */ + ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + /* write new count */ + apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT); + ctrl |= APBTMR_CONTROL_ENABLE; + apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL); + return 0; } /* @@ -540,94 +540,94 @@ static int apbt_next_event(unsigned long delta, */ static cycle_t apbt_read_clocksource(struct clocksource *cs) { - unsigned long t0, t1, t2; - static unsigned long last_read; + unsigned long t0, t1, t2; + static unsigned long last_read; bad_count: - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if (unlikely(t1 < t2)) { - pr_debug("APBT: read current count error %lx:%lx:%lx\n", - t1, t2, t2 - t1); - goto bad_count; - } - /* - * check against cached last read, makes sure time does not go back. - * it could be a normal rollover but we will do tripple check anyway - */ - if (unlikely(t2 > last_read)) { - /* check if we have a normal rollover */ - unsigned long raw_intr_status = - apbt_readl_reg(APBTMRS_RAW_INT_STATUS); - /* - * cs timer interrupt is masked but raw intr bit is set if - * rollover occurs. then we read EOI reg to clear it. - */ - if (raw_intr_status & (1 << phy_cs_timer_id)) { - apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); - goto out; - } - pr_debug("APB CS going back %lx:%lx:%lx ", - t2, last_read, t2 - last_read); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if (unlikely(t1 < t2)) { + pr_debug("APBT: read current count error %lx:%lx:%lx\n", + t1, t2, t2 - t1); + goto bad_count; + } + /* + * check against cached last read, makes sure time does not go back. + * it could be a normal rollover but we will do tripple check anyway + */ + if (unlikely(t2 > last_read)) { + /* check if we have a normal rollover */ + unsigned long raw_intr_status = + apbt_readl_reg(APBTMRS_RAW_INT_STATUS); + /* + * cs timer interrupt is masked but raw intr bit is set if + * rollover occurs. then we read EOI reg to clear it. + */ + if (raw_intr_status & (1 << phy_cs_timer_id)) { + apbt_readl(phy_cs_timer_id, APBTMR_N_EOI); + goto out; + } + pr_debug("APB CS going back %lx:%lx:%lx ", + t2, last_read, t2 - last_read); bad_count_x3: - pr_debug(KERN_INFO "tripple check enforced\n"); - t0 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t1 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - udelay(1); - t2 = apbt_readl(phy_cs_timer_id, - APBTMR_N_CURRENT_VALUE); - if ((t2 > t1) || (t1 > t0)) { - printk(KERN_ERR "Error: APB CS tripple check failed\n"); - goto bad_count_x3; - } - } + pr_debug(KERN_INFO "tripple check enforced\n"); + t0 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t1 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + udelay(1); + t2 = apbt_readl(phy_cs_timer_id, + APBTMR_N_CURRENT_VALUE); + if ((t2 > t1) || (t1 > t0)) { + printk(KERN_ERR "Error: APB CS tripple check failed\n"); + goto bad_count_x3; + } + } out: - last_read = t2; - return (cycle_t)~t2; + last_read = t2; + return (cycle_t)~t2; } static int apbt_clocksource_register(void) { - u64 start, now; - cycle_t t1; - - /* Start the counter, use timer 2 as source, timer 0/1 for event */ - apbt_start_counter(phy_cs_timer_id); - - /* Verify whether apbt counter works */ - t1 = apbt_read_clocksource(&clocksource_apbt); - rdtscll(start); - - /* - * We don't know the TSC frequency yet, but waiting for - * 200000 TSC cycles is safe: - * 4 GHz == 50us - * 1 GHz == 200us - */ - do { - rep_nop(); - rdtscll(now); - } while ((now - start) < 200000UL); - - /* APBT is the only always on clocksource, it has to work! */ - if (t1 == apbt_read_clocksource(&clocksource_apbt)) - panic("APBT counter not counting. APBT disabled\n"); - - /* - * initialize and register APBT clocksource - * convert that to ns/clock cycle - * mult = (ns/c) * 2^APBT_SHIFT - */ - clocksource_apbt.mult = div_sc(MSEC_PER_SEC, - (unsigned long) apbt_freq, APBT_SHIFT); - clocksource_register(&clocksource_apbt); - - return 0; + u64 start, now; + cycle_t t1; + + /* Start the counter, use timer 2 as source, timer 0/1 for event */ + apbt_start_counter(phy_cs_timer_id); + + /* Verify whether apbt counter works */ + t1 = apbt_read_clocksource(&clocksource_apbt); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + /* APBT is the only always on clocksource, it has to work! */ + if (t1 == apbt_read_clocksource(&clocksource_apbt)) + panic("APBT counter not counting. APBT disabled\n"); + + /* + * initialize and register APBT clocksource + * convert that to ns/clock cycle + * mult = (ns/c) * 2^APBT_SHIFT + */ + clocksource_apbt.mult = div_sc(MSEC_PER_SEC, + (unsigned long) apbt_freq, APBT_SHIFT); + clocksource_register(&clocksource_apbt); + + return 0; } /* @@ -640,145 +640,145 @@ static int apbt_clocksource_register(void) void __init apbt_time_init(void) { #ifdef CONFIG_SMP - int i; - struct sfi_timer_table_entry *p_mtmr; - unsigned int percpu_timer; - struct apbt_dev *adev; + int i; + struct sfi_timer_table_entry *p_mtmr; + unsigned int percpu_timer; + struct apbt_dev *adev; #endif - if (apb_timer_block_enabled) - return; - apbt_set_mapping(); - if (apbt_virt_address) { - pr_debug("Found APBT version 0x%lx\n",\ - apbt_readl_reg(APBTMRS_COMP_VERSION)); - } else - goto out_noapbt; - /* - * Read the frequency and check for a sane value, for ESL model - * we extend the possible clock range to allow time scaling. - */ - - if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { - pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); - goto out_noapbt; - } - if (apbt_clocksource_register()) { - pr_debug("APBT has failed to register clocksource\n"); - goto out_noapbt; - } - if (!apbt_clockevent_register()) - apb_timer_block_enabled = 1; - else { - pr_debug("APBT has failed to register clockevent\n"); - goto out_noapbt; - } + if (apb_timer_block_enabled) + return; + apbt_set_mapping(); + if (apbt_virt_address) { + pr_debug("Found APBT version 0x%lx\n",\ + apbt_readl_reg(APBTMRS_COMP_VERSION)); + } else + goto out_noapbt; + /* + * Read the frequency and check for a sane value, for ESL model + * we extend the possible clock range to allow time scaling. + */ + + if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) { + pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq); + goto out_noapbt; + } + if (apbt_clocksource_register()) { + pr_debug("APBT has failed to register clocksource\n"); + goto out_noapbt; + } + if (!apbt_clockevent_register()) + apb_timer_block_enabled = 1; + else { + pr_debug("APBT has failed to register clockevent\n"); + goto out_noapbt; + } #ifdef CONFIG_SMP - /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { - printk(KERN_INFO "apbt: disabled per cpu timer\n"); - return; - } - pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); - if (num_possible_cpus() <= sfi_mtimer_num) { - percpu_timer = 1; - apbt_num_timers_used = num_possible_cpus(); - } else { - percpu_timer = 0; - apbt_num_timers_used = 1; - adev = &per_cpu(cpu_apbt_dev, 0); - adev->flags &= ~APBT_DEV_USED; - } - pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); - - /* here we set up per CPU timer data structure */ - apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, - GFP_KERNEL); - if (!apbt_devs) { - printk(KERN_ERR "Failed to allocate APB timer devices\n"); - return; - } - for (i = 0; i < apbt_num_timers_used; i++) { - adev = &per_cpu(cpu_apbt_dev, i); - adev->num = i; - adev->cpu = i; - p_mtmr = sfi_get_mtmr(i); - if (p_mtmr) { - adev->tick = p_mtmr->freq_hz; - adev->irq = p_mtmr->irq; - } else - printk(KERN_ERR "Failed to get timer for cpu %d\n", i); - adev->count = 0; - sprintf(adev->name, "apbt%d", i); - } + /* kernel cmdline disable apb timer, so we will use lapic timers */ + if (disable_apbt_percpu) { + printk(KERN_INFO "apbt: disabled per cpu timer\n"); + return; + } + pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus()); + if (num_possible_cpus() <= sfi_mtimer_num) { + percpu_timer = 1; + apbt_num_timers_used = num_possible_cpus(); + } else { + percpu_timer = 0; + apbt_num_timers_used = 1; + adev = &per_cpu(cpu_apbt_dev, 0); + adev->flags &= ~APBT_DEV_USED; + } + pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used); + + /* here we set up per CPU timer data structure */ + apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used, + GFP_KERNEL); + if (!apbt_devs) { + printk(KERN_ERR "Failed to allocate APB timer devices\n"); + return; + } + for (i = 0; i < apbt_num_timers_used; i++) { + adev = &per_cpu(cpu_apbt_dev, i); + adev->num = i; + adev->cpu = i; + p_mtmr = sfi_get_mtmr(i); + if (p_mtmr) { + adev->tick = p_mtmr->freq_hz; + adev->irq = p_mtmr->irq; + } else + printk(KERN_ERR "Failed to get timer for cpu %d\n", i); + adev->count = 0; + sprintf(adev->name, "apbt%d", i); + } #endif - return; + return; out_noapbt: - apbt_clear_mapping(); - apb_timer_block_enabled = 0; - panic("failed to enable APB timer\n"); + apbt_clear_mapping(); + apb_timer_block_enabled = 0; + panic("failed to enable APB timer\n"); } static inline void apbt_disable(int n) { - if (is_apbt_capable()) { - unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); - ctrl &= ~APBTMR_CONTROL_ENABLE; - apbt_writel(n, ctrl, APBTMR_N_CONTROL); - } + if (is_apbt_capable()) { + unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); + ctrl &= ~APBTMR_CONTROL_ENABLE; + apbt_writel(n, ctrl, APBTMR_N_CONTROL); + } } /* called before apb_timer_enable, use early map */ unsigned long apbt_quick_calibrate() { - int i, scale; - u64 old, new; - cycle_t t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - apbt_start_counter(phy_cs_timer_id); - - /* check if the timer can count down, otherwise return */ - old = apbt_read_clocksource(&clocksource_apbt); - i = 10000; - while (--i) { - if (old != apbt_read_clocksource(&clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq * 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - apbt_start_counter(phy_cs_timer_id); - - old = apbt_read_clocksource(&clocksource_apbt); - old += loop; - - t1 = __native_read_tsc(); - - do { - new = apbt_read_clocksource(&clocksource_apbt); - } while (new < old); - - t2 = __native_read_tsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * apbt_freq * 1000) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; + int i, scale; + u64 old, new; + cycle_t t1, t2; + unsigned long khz = 0; + u32 loop, shift; + + apbt_set_mapping(); + apbt_start_counter(phy_cs_timer_id); + + /* check if the timer can count down, otherwise return */ + old = apbt_read_clocksource(&clocksource_apbt); + i = 10000; + while (--i) { + if (old != apbt_read_clocksource(&clocksource_apbt)) + break; + } + if (!i) + goto failed; + + /* count 16 ms */ + loop = (apbt_freq * 1000) << 4; + + /* restart the timer to ensure it won't get to 0 in the calibration */ + apbt_start_counter(phy_cs_timer_id); + + old = apbt_read_clocksource(&clocksource_apbt); + old += loop; + + t1 = __native_read_tsc(); + + do { + new = apbt_read_clocksource(&clocksource_apbt); + } while (new < old); + + t2 = __native_read_tsc(); + + shift = 5; + if (unlikely(loop >> shift == 0)) { + printk(KERN_INFO + "APBT TSC calibration failed, not enough resolution\n"); + return 0; + } + scale = (int)div_u64((t2 - t1), loop >> shift); + khz = (scale * apbt_freq * 1000) >> shift; + printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); + return khz; failed: - return 0; + return 0; } -- cgit v1.2.3-70-g09d2 From e5a11016643d1ab7172193591506d33a844734cc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 3 Mar 2010 22:38:50 -0500 Subject: x86: Issue at least one memory barrier in stop_machine_text_poke() Fix stop_machine_text_poke() to issue smp_mb() before exiting waiting loop, and use cpu_relax() for waiting. Changes in v2: - Don't use ACCESS_ONCE(). Signed-off-by: Masami Hiramatsu Acked-by: Mathieu Desnoyers Cc: systemtap Cc: DLE Cc: Jason Baron LKML-Reference: <20100304033850.3819.74590.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c41f13c15e8..e0b87709947 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -595,8 +595,8 @@ static int __kprobes stop_machine_text_poke(void *data) wrote_text = 1; } else { while (!wrote_text) - smp_rmb(); - sync_core(); + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ } flush_icache_range((unsigned long)tpp->addr, -- cgit v1.2.3-70-g09d2 From 6c550ee41596798cbd873d3df9f8ea0a4ce7ad2f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 5 Mar 2010 09:52:52 -0800 Subject: x86: fix mtrr missing kernel-doc Fix missing kernel-doc notation in mtrr/main.c: Warning(arch/x86/kernel/cpu/mtrr/main.c:152): No description found for parameter 'info' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mtrr/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index fe4622e8c83..79556bd9b60 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -145,6 +145,7 @@ struct set_mtrr_data { /** * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ -- cgit v1.2.3-70-g09d2 From 984b3f5746ed2cde3d184651dabf26980f2b66e5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 5 Mar 2010 13:41:37 -0800 Subject: bitops: rename for_each_bit() to for_each_set_bit() Rename for_each_bit to for_each_set_bit in the kernel source tree. To permit for_each_clear_bit(), should that ever be added. The patch includes a macro to map the old for_each_bit() onto the new for_each_set_bit(). This is a (very) temporary thing to ease the migration. [akpm@linux-foundation.org: add temporary for_each_bit()] Suggested-by: Alexey Dobriyan Suggested-by: Andrew Morton Signed-off-by: Akinobu Mita Cc: "David S. Miller" Cc: Russell King Cc: David Woodhouse Cc: Artem Bityutskiy Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- drivers/dma/ioat/dma.c | 2 +- drivers/gpio/pl061.c | 2 +- drivers/gpio/timbgpio.c | 2 +- drivers/i2c/busses/i2c-designware.c | 4 ++-- drivers/mfd/htc-egpio.c | 2 +- drivers/misc/sgi-xp/xpnet.c | 2 +- drivers/net/gianfar.c | 12 ++++++------ drivers/net/ixgbe/ixgbe_main.c | 2 +- drivers/net/ixgbevf/ixgbevf_main.c | 2 +- drivers/net/wireless/ath/ar9170/main.c | 2 +- drivers/net/wireless/iwmc3200wifi/debugfs.c | 2 +- drivers/net/wireless/iwmc3200wifi/rx.c | 2 +- fs/ocfs2/quota_local.c | 2 +- include/linux/bitops.h | 4 +++- kernel/sched_cpupri.c | 2 +- sound/soc/codecs/uda1380.c | 2 +- 18 files changed, 26 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 641ccb9dddb..b1fbdeecf6c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -676,7 +676,7 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (c->weight != w) continue; - for_each_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { if (!test_bit(j, used_mask)) break; } diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cf6590cf4a5..977e7544738 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -757,7 +757,7 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; clear_bit(bit, (unsigned long *) &status); diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index 5d0e42b263d..af14c9a5b8d 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -71,7 +71,7 @@ static irqreturn_t ioat_dma_do_interrupt(int irq, void *data) } attnstatus = readl(instance->reg_base + IOAT_ATTNSTATUS_OFFSET); - for_each_bit(bit, &attnstatus, BITS_PER_LONG) { + for_each_set_bit(bit, &attnstatus, BITS_PER_LONG) { chan = ioat_chan_by_index(instance, bit); tasklet_schedule(&chan->cleanup_task); } diff --git a/drivers/gpio/pl061.c b/drivers/gpio/pl061.c index 4ee4c8367a3..3ad1eeb4960 100644 --- a/drivers/gpio/pl061.c +++ b/drivers/gpio/pl061.c @@ -219,7 +219,7 @@ static void pl061_irq_handler(unsigned irq, struct irq_desc *desc) if (pending == 0) continue; - for_each_bit(offset, &pending, PL061_GPIO_NR) + for_each_set_bit(offset, &pending, PL061_GPIO_NR) generic_handle_irq(pl061_to_irq(&chip->gc, offset)); } desc->chip->unmask(irq); diff --git a/drivers/gpio/timbgpio.c b/drivers/gpio/timbgpio.c index d941f45fe55..4ecba6e5a32 100644 --- a/drivers/gpio/timbgpio.c +++ b/drivers/gpio/timbgpio.c @@ -175,7 +175,7 @@ static void timbgpio_irq(unsigned int irq, struct irq_desc *desc) ipr = ioread32(tgpio->membase + TGPIO_IPR); iowrite32(ipr, tgpio->membase + TGPIO_ICR); - for_each_bit(offset, &ipr, tgpio->gpio.ngpio) + for_each_set_bit(offset, &ipr, tgpio->gpio.ngpio) generic_handle_irq(timbgpio_to_irq(&tgpio->gpio, offset)); } diff --git a/drivers/i2c/busses/i2c-designware.c b/drivers/i2c/busses/i2c-designware.c index 9e18ef97f15..3e72b69aa7f 100644 --- a/drivers/i2c/busses/i2c-designware.c +++ b/drivers/i2c/busses/i2c-designware.c @@ -497,13 +497,13 @@ static int i2c_dw_handle_tx_abort(struct dw_i2c_dev *dev) int i; if (abort_source & DW_IC_TX_ABRT_NOACK) { - for_each_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) + for_each_set_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) dev_dbg(dev->dev, "%s: %s\n", __func__, abort_sources[i]); return -EREMOTEIO; } - for_each_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) + for_each_set_bit(i, &abort_source, ARRAY_SIZE(abort_sources)) dev_err(dev->dev, "%s: %s\n", __func__, abort_sources[i]); if (abort_source & DW_IC_TX_ARB_LOST) diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index aa266e1f69b..addb846c1e3 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -108,7 +108,7 @@ static void egpio_handler(unsigned int irq, struct irq_desc *desc) ack_irqs(ei); /* Process all set pins. */ readval &= ei->irqs_enabled; - for_each_bit(irqpin, &readval, ei->nirqs) { + for_each_set_bit(irqpin, &readval, ei->nirqs) { /* Run irq handler */ pr_debug("got IRQ %d\n", irqpin); irq = ei->irq_start + irqpin; diff --git a/drivers/misc/sgi-xp/xpnet.c b/drivers/misc/sgi-xp/xpnet.c index 16f0abda142..57b152f8d1b 100644 --- a/drivers/misc/sgi-xp/xpnet.c +++ b/drivers/misc/sgi-xp/xpnet.c @@ -475,7 +475,7 @@ xpnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->data[0] == 0xff) { /* we are being asked to broadcast to all partitions */ - for_each_bit(dest_partid, xpnet_broadcast_partitions, + for_each_set_bit(dest_partid, xpnet_broadcast_partitions, xp_max_npartitions) { xpnet_send(skb, queued_msg, start_addr, end_addr, diff --git a/drivers/net/gianfar.c b/drivers/net/gianfar.c index 6aa526ee909..61a7b4351e7 100644 --- a/drivers/net/gianfar.c +++ b/drivers/net/gianfar.c @@ -998,7 +998,7 @@ static int gfar_probe(struct of_device *ofdev, } /* Need to reverse the bit maps as bit_map's MSB is q0 - * but, for_each_bit parses from right to left, which + * but, for_each_set_bit parses from right to left, which * basically reverses the queue numbers */ for (i = 0; i< priv->num_grps; i++) { priv->gfargrp[i].tx_bit_map = reverse_bitmap( @@ -1011,7 +1011,7 @@ static int gfar_probe(struct of_device *ofdev, * also assign queues to groups */ for (grp_idx = 0; grp_idx < priv->num_grps; grp_idx++) { priv->gfargrp[grp_idx].num_rx_queues = 0x0; - for_each_bit(i, &priv->gfargrp[grp_idx].rx_bit_map, + for_each_set_bit(i, &priv->gfargrp[grp_idx].rx_bit_map, priv->num_rx_queues) { priv->gfargrp[grp_idx].num_rx_queues++; priv->rx_queue[i]->grp = &priv->gfargrp[grp_idx]; @@ -1019,7 +1019,7 @@ static int gfar_probe(struct of_device *ofdev, rqueue = rqueue | ((RQUEUE_EN0 | RQUEUE_EX0) >> i); } priv->gfargrp[grp_idx].num_tx_queues = 0x0; - for_each_bit (i, &priv->gfargrp[grp_idx].tx_bit_map, + for_each_set_bit(i, &priv->gfargrp[grp_idx].tx_bit_map, priv->num_tx_queues) { priv->gfargrp[grp_idx].num_tx_queues++; priv->tx_queue[i]->grp = &priv->gfargrp[grp_idx]; @@ -1709,7 +1709,7 @@ void gfar_configure_coalescing(struct gfar_private *priv, if (priv->mode == MQ_MG_MODE) { baddr = ®s->txic0; - for_each_bit (i, &tx_mask, priv->num_tx_queues) { + for_each_set_bit(i, &tx_mask, priv->num_tx_queues) { if (likely(priv->tx_queue[i]->txcoalescing)) { gfar_write(baddr + i, 0); gfar_write(baddr + i, priv->tx_queue[i]->txic); @@ -1717,7 +1717,7 @@ void gfar_configure_coalescing(struct gfar_private *priv, } baddr = ®s->rxic0; - for_each_bit (i, &rx_mask, priv->num_rx_queues) { + for_each_set_bit(i, &rx_mask, priv->num_rx_queues) { if (likely(priv->rx_queue[i]->rxcoalescing)) { gfar_write(baddr + i, 0); gfar_write(baddr + i, priv->rx_queue[i]->rxic); @@ -2607,7 +2607,7 @@ static int gfar_poll(struct napi_struct *napi, int budget) budget_per_queue = left_over_budget/num_queues; left_over_budget = 0; - for_each_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) { + for_each_set_bit(i, &gfargrp->rx_bit_map, priv->num_rx_queues) { if (test_bit(i, &serviced_queues)) continue; rx_queue = priv->rx_queue[i]; diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 45e3532b166..684af371462 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -1050,7 +1050,7 @@ static void ixgbe_configure_msix(struct ixgbe_adapter *adapter) */ for (v_idx = 0; v_idx < q_vectors; v_idx++) { q_vector = adapter->q_vector[v_idx]; - /* XXX for_each_bit(...) */ + /* XXX for_each_set_bit(...) */ r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues); diff --git a/drivers/net/ixgbevf/ixgbevf_main.c b/drivers/net/ixgbevf/ixgbevf_main.c index 235b5fd4b8d..ca653c49b76 100644 --- a/drivers/net/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ixgbevf/ixgbevf_main.c @@ -751,7 +751,7 @@ static void ixgbevf_configure_msix(struct ixgbevf_adapter *adapter) */ for (v_idx = 0; v_idx < q_vectors; v_idx++) { q_vector = adapter->q_vector[v_idx]; - /* XXX for_each_bit(...) */ + /* XXX for_each_set_bit(...) */ r_idx = find_first_bit(q_vector->rxr_idx, adapter->num_rx_queues); diff --git a/drivers/net/wireless/ath/ar9170/main.c b/drivers/net/wireless/ath/ar9170/main.c index 8a964f13036..a6452af9c6c 100644 --- a/drivers/net/wireless/ath/ar9170/main.c +++ b/drivers/net/wireless/ath/ar9170/main.c @@ -394,7 +394,7 @@ static void ar9170_tx_fake_ampdu_status(struct ar9170 *ar) ieee80211_tx_status_irqsafe(ar->hw, skb); } - for_each_bit(i, &queue_bitmap, BITS_PER_BYTE) { + for_each_set_bit(i, &queue_bitmap, BITS_PER_BYTE) { #ifdef AR9170_QUEUE_STOP_DEBUG printk(KERN_DEBUG "%s: wake queue %d\n", wiphy_name(ar->hw->wiphy), i); diff --git a/drivers/net/wireless/iwmc3200wifi/debugfs.c b/drivers/net/wireless/iwmc3200wifi/debugfs.c index be992ca41cf..c29c994de0e 100644 --- a/drivers/net/wireless/iwmc3200wifi/debugfs.c +++ b/drivers/net/wireless/iwmc3200wifi/debugfs.c @@ -89,7 +89,7 @@ static int iwm_debugfs_dbg_modules_write(void *data, u64 val) for (i = 0; i < __IWM_DM_NR; i++) iwm->dbg.dbg_module[i] = 0; - for_each_bit(bit, &iwm->dbg.dbg_modules, __IWM_DM_NR) + for_each_set_bit(bit, &iwm->dbg.dbg_modules, __IWM_DM_NR) iwm->dbg.dbg_module[bit] = iwm->dbg.dbg_level; return 0; diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c index ad8f7eabb5a..8456b4dbd14 100644 --- a/drivers/net/wireless/iwmc3200wifi/rx.c +++ b/drivers/net/wireless/iwmc3200wifi/rx.c @@ -1116,7 +1116,7 @@ static int iwm_ntf_stop_resume_tx(struct iwm_priv *iwm, u8 *buf, return -EINVAL; } - for_each_bit(bit, (unsigned long *)&tid_msk, IWM_UMAC_TID_NR) { + for_each_set_bit(bit, (unsigned long *)&tid_msk, IWM_UMAC_TID_NR) { tid_info = &sta_info->tid_info[bit]; mutex_lock(&tid_info->mutex); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 21f9e71223c..a6467f3d262 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -457,7 +457,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, break; } dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; - for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + for_each_set_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { qbh = NULL; status = ocfs2_read_quota_block(lqinode, ol_dqblk_block(sb, chunk, bit), diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 25b8b2f33ae..b7938987923 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -16,11 +16,13 @@ */ #include -#define for_each_bit(bit, addr, size) \ +#define for_each_set_bit(bit, addr, size) \ for ((bit) = find_first_bit((addr), (size)); \ (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) +/* Temporary */ +#define for_each_bit(bit, addr, size) for_each_set_bit(bit, addr, size) static __inline__ int get_bitmask_order(unsigned int count) { diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index eeb3506c483..82095bf2099 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -47,7 +47,7 @@ static int convert_prio(int prio) } #define for_each_cpupri_active(array, idx) \ - for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) + for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) /** * cpupri_find - find the best (lowest-pri) CPU in the system diff --git a/sound/soc/codecs/uda1380.c b/sound/soc/codecs/uda1380.c index a2763c2e734..9cd0a66b766 100644 --- a/sound/soc/codecs/uda1380.c +++ b/sound/soc/codecs/uda1380.c @@ -137,7 +137,7 @@ static void uda1380_flush_work(struct work_struct *work) { int bit, reg; - for_each_bit(bit, &uda1380_cache_dirty, UDA1380_CACHEREGNUM - 0x10) { + for_each_set_bit(bit, &uda1380_cache_dirty, UDA1380_CACHEREGNUM - 0x10) { reg = 0x10 + bit; pr_debug("uda1380: flush reg %x val %x:\n", reg, uda1380_read_reg_cache(uda1380_codec, reg)); -- cgit v1.2.3-70-g09d2 From 52cf25d0ab7f78eeecc59ac652ed5090f69b619e Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Tue, 19 Jan 2010 02:58:23 +0100 Subject: Driver core: Constify struct sysfs_ops in struct kobj_type Constify struct sysfs_ops. This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy Acked-by: David Teigland Acked-by: Matt Domsch Acked-by: Maciej Sosnowski Acked-by: Hans J. Koch Acked-by: Pekka Enberg Acked-by: Jens Axboe Acked-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman --- Documentation/kobject.txt | 2 +- arch/ia64/kernel/topology.c | 2 +- arch/powerpc/kernel/cacheinfo.c | 2 +- arch/sh/kernel/cpu/sh4/sq.c | 2 +- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +- block/blk-integrity.c | 2 +- block/blk-sysfs.c | 2 +- block/elevator.c | 2 +- drivers/base/bus.c | 4 ++-- drivers/base/class.c | 2 +- drivers/base/core.c | 2 +- drivers/base/sys.c | 4 ++-- drivers/block/pktcdvd.c | 2 +- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpuidle/sysfs.c | 4 ++-- drivers/dma/ioat/dma.c | 2 +- drivers/dma/ioat/dma.h | 2 +- drivers/edac/edac_device_sysfs.c | 6 +++--- drivers/edac/edac_mc_sysfs.c | 4 ++-- drivers/edac/edac_pci_sysfs.c | 4 ++-- drivers/firmware/edd.c | 2 +- drivers/firmware/efivars.c | 2 +- drivers/firmware/iscsi_ibft.c | 2 +- drivers/firmware/memmap.c | 2 +- drivers/gpu/drm/ttm/ttm_bo.c | 2 +- drivers/gpu/drm/ttm/ttm_memory.c | 2 +- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/core/sysfs.c | 2 +- drivers/md/dm-sysfs.c | 2 +- drivers/md/md.c | 4 ++-- drivers/net/ibmveth.c | 2 +- drivers/net/iseries_veth.c | 4 ++-- drivers/parisc/pdc_stable.c | 2 +- drivers/pci/hotplug/fakephp.c | 2 +- drivers/pci/slot.c | 2 +- drivers/uio/uio.c | 4 ++-- drivers/uwb/wlp/sysfs.c | 3 +-- drivers/video/omap2/dss/manager.c | 2 +- drivers/video/omap2/dss/overlay.c | 2 +- drivers/xen/sys-hypervisor.c | 2 +- fs/btrfs/sysfs.c | 4 ++-- fs/dlm/lockspace.c | 2 +- fs/ext4/super.c | 2 +- fs/gfs2/sys.c | 2 +- fs/ocfs2/cluster/masklog.c | 2 +- fs/sysfs/file.c | 8 ++++---- include/linux/kobject.h | 4 ++-- kernel/params.c | 2 +- lib/kobject.c | 2 +- mm/slub.c | 2 +- net/bridge/br_private.h | 2 +- net/bridge/br_sysfs_if.c | 2 +- samples/kobject/kset-example.c | 2 +- 54 files changed, 69 insertions(+), 70 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt index c79ab996dad..bdb13817e1e 100644 --- a/Documentation/kobject.txt +++ b/Documentation/kobject.txt @@ -266,7 +266,7 @@ kobj_type: struct kobj_type { void (*release)(struct kobject *); - struct sysfs_ops *sysfs_ops; + const struct sysfs_ops *sysfs_ops; struct attribute **default_attrs; }; diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 8f060352e12..b3a5818088d 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -282,7 +282,7 @@ static ssize_t cache_show(struct kobject * kobj, struct attribute * attr, char * return ret; } -static struct sysfs_ops cache_sysfs_ops = { +static const struct sysfs_ops cache_sysfs_ops = { .show = cache_show }; diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index bb37b1d19a5..01fe9ce2837 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -642,7 +642,7 @@ static struct kobj_attribute *cache_index_opt_attrs[] = { &cache_assoc_attr, }; -static struct sysfs_ops cache_index_ops = { +static const struct sysfs_ops cache_index_ops = { .show = cache_index_show, }; diff --git a/arch/sh/kernel/cpu/sh4/sq.c b/arch/sh/kernel/cpu/sh4/sq.c index fc065f9da6e..14726eef1ce 100644 --- a/arch/sh/kernel/cpu/sh4/sq.c +++ b/arch/sh/kernel/cpu/sh4/sq.c @@ -326,7 +326,7 @@ static struct attribute *sq_sysfs_attrs[] = { NULL, }; -static struct sysfs_ops sq_sysfs_ops = { +static const struct sysfs_ops sq_sysfs_ops = { .show = sq_sysfs_show, .store = sq_sysfs_store, }; diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index eddb1bdd1b8..b3eeb66c0a5 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -903,7 +903,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 83a3d1f4efc..cda932ca3ad 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -388,7 +388,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = { .show = show, .store = store, }; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 15c630813b1..96e83c2bdb9 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -278,7 +278,7 @@ static struct attribute *integrity_attrs[] = { NULL, }; -static struct sysfs_ops integrity_ops = { +static const struct sysfs_ops integrity_ops = { .show = &integrity_attr_show, .store = &integrity_attr_store, }; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e85442415db..2ae2cb3f362 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -450,7 +450,7 @@ static void blk_release_queue(struct kobject *kobj) kmem_cache_free(blk_requestq_cachep, q); } -static struct sysfs_ops queue_sysfs_ops = { +static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; diff --git a/block/elevator.c b/block/elevator.c index ee3a883840f..df75676f667 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -892,7 +892,7 @@ elv_attr_store(struct kobject *kobj, struct attribute *attr, return error; } -static struct sysfs_ops elv_sysfs_ops = { +static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 2afe599eb35..cca1aa10054 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -70,7 +70,7 @@ static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops driver_sysfs_ops = { +static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; @@ -115,7 +115,7 @@ static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops bus_sysfs_ops = { +static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; diff --git a/drivers/base/class.c b/drivers/base/class.c index 2e297cc4cd3..0147f476b8a 100644 --- a/drivers/base/class.c +++ b/drivers/base/class.c @@ -63,7 +63,7 @@ static void class_release(struct kobject *kobj) kfree(cp); } -static struct sysfs_ops class_sysfs_ops = { +static const struct sysfs_ops class_sysfs_ops = { .show = class_attr_show, .store = class_attr_store, }; diff --git a/drivers/base/core.c b/drivers/base/core.c index 58ec1069f4b..b0d6646a281 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -100,7 +100,7 @@ static ssize_t dev_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops dev_sysfs_ops = { +static const struct sysfs_ops dev_sysfs_ops = { .show = dev_attr_show, .store = dev_attr_store, }; diff --git a/drivers/base/sys.c b/drivers/base/sys.c index 747c99e0568..8980feec5d1 100644 --- a/drivers/base/sys.c +++ b/drivers/base/sys.c @@ -54,7 +54,7 @@ sysdev_store(struct kobject *kobj, struct attribute *attr, return -EIO; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = sysdev_show, .store = sysdev_store, }; @@ -104,7 +104,7 @@ static ssize_t sysdev_class_store(struct kobject *kobj, struct attribute *attr, return -EIO; } -static struct sysfs_ops sysfs_class_ops = { +static const struct sysfs_ops sysfs_class_ops = { .show = sysdev_class_show, .store = sysdev_class_store, }; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 73d815d3f1b..39c8514442e 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -284,7 +284,7 @@ static ssize_t kobj_pkt_store(struct kobject *kobj, return len; } -static struct sysfs_ops kobj_pkt_ops = { +static const struct sysfs_ops kobj_pkt_ops = { .show = kobj_pkt_show, .store = kobj_pkt_store }; diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 67bc2ece7b4..2d5d575e889 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -766,7 +766,7 @@ static void cpufreq_sysfs_release(struct kobject *kobj) complete(&policy->kobj_unregister); } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c index c9cefacabf3..8719b36e1a4 100644 --- a/drivers/cpuidle/sysfs.c +++ b/drivers/cpuidle/sysfs.c @@ -195,7 +195,7 @@ static ssize_t cpuidle_store(struct kobject * kobj, struct attribute * attr, return ret; } -static struct sysfs_ops cpuidle_sysfs_ops = { +static const struct sysfs_ops cpuidle_sysfs_ops = { .show = cpuidle_show, .store = cpuidle_store, }; @@ -281,7 +281,7 @@ static ssize_t cpuidle_state_show(struct kobject * kobj, return ret; } -static struct sysfs_ops cpuidle_state_sysfs_ops = { +static const struct sysfs_ops cpuidle_state_sysfs_ops = { .show = cpuidle_state_show, }; diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index af14c9a5b8d..0099340b961 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -1138,7 +1138,7 @@ ioat_attr_show(struct kobject *kobj, struct attribute *attr, char *page) return entry->show(&chan->common, page); } -struct sysfs_ops ioat_sysfs_ops = { +const struct sysfs_ops ioat_sysfs_ops = { .show = ioat_attr_show, }; diff --git a/drivers/dma/ioat/dma.h b/drivers/dma/ioat/dma.h index 4f747a25407..86b97ac8774 100644 --- a/drivers/dma/ioat/dma.h +++ b/drivers/dma/ioat/dma.h @@ -346,7 +346,7 @@ bool ioat_cleanup_preamble(struct ioat_chan_common *chan, unsigned long *phys_complete); void ioat_kobject_add(struct ioatdma_device *device, struct kobj_type *type); void ioat_kobject_del(struct ioatdma_device *device); -extern struct sysfs_ops ioat_sysfs_ops; +extern const struct sysfs_ops ioat_sysfs_ops; extern struct ioat_sysfs_entry ioat_version_attr; extern struct ioat_sysfs_entry ioat_cap_attr; #endif /* IOATDMA_H */ diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c index 53764577035..5fdedbc0f54 100644 --- a/drivers/edac/edac_device_sysfs.c +++ b/drivers/edac/edac_device_sysfs.c @@ -137,7 +137,7 @@ static ssize_t edac_dev_ctl_info_store(struct kobject *kobj, } /* edac_dev file operations for an 'ctl_info' */ -static struct sysfs_ops device_ctl_info_ops = { +static const struct sysfs_ops device_ctl_info_ops = { .show = edac_dev_ctl_info_show, .store = edac_dev_ctl_info_store }; @@ -373,7 +373,7 @@ static ssize_t edac_dev_instance_store(struct kobject *kobj, } /* edac_dev file operations for an 'instance' */ -static struct sysfs_ops device_instance_ops = { +static const struct sysfs_ops device_instance_ops = { .show = edac_dev_instance_show, .store = edac_dev_instance_store }; @@ -476,7 +476,7 @@ static ssize_t edac_dev_block_store(struct kobject *kobj, } /* edac_dev file operations for a 'block' */ -static struct sysfs_ops device_block_ops = { +static const struct sysfs_ops device_block_ops = { .show = edac_dev_block_show, .store = edac_dev_block_store }; diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c index e1d4ce08348..88840e9fa3e 100644 --- a/drivers/edac/edac_mc_sysfs.c +++ b/drivers/edac/edac_mc_sysfs.c @@ -245,7 +245,7 @@ static ssize_t csrowdev_store(struct kobject *kobj, struct attribute *attr, return -EIO; } -static struct sysfs_ops csrowfs_ops = { +static const struct sysfs_ops csrowfs_ops = { .show = csrowdev_show, .store = csrowdev_store }; @@ -575,7 +575,7 @@ static ssize_t mcidev_store(struct kobject *kobj, struct attribute *attr, } /* Intermediate show/store table */ -static struct sysfs_ops mci_ops = { +static const struct sysfs_ops mci_ops = { .show = mcidev_show, .store = mcidev_store }; diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c index fb60a877d76..bef94e3d994 100644 --- a/drivers/edac/edac_pci_sysfs.c +++ b/drivers/edac/edac_pci_sysfs.c @@ -121,7 +121,7 @@ static ssize_t edac_pci_instance_store(struct kobject *kobj, } /* fs_ops table */ -static struct sysfs_ops pci_instance_ops = { +static const struct sysfs_ops pci_instance_ops = { .show = edac_pci_instance_show, .store = edac_pci_instance_store }; @@ -261,7 +261,7 @@ static ssize_t edac_pci_dev_store(struct kobject *kobj, return -EIO; } -static struct sysfs_ops edac_pci_sysfs_ops = { +static const struct sysfs_ops edac_pci_sysfs_ops = { .show = edac_pci_dev_show, .store = edac_pci_dev_store }; diff --git a/drivers/firmware/edd.c b/drivers/firmware/edd.c index 9e4f59dc7f1..110e24e5088 100644 --- a/drivers/firmware/edd.c +++ b/drivers/firmware/edd.c @@ -122,7 +122,7 @@ edd_attr_show(struct kobject * kobj, struct attribute *attr, char *buf) return ret; } -static struct sysfs_ops edd_attr_ops = { +static const struct sysfs_ops edd_attr_ops = { .show = edd_attr_show, }; diff --git a/drivers/firmware/efivars.c b/drivers/firmware/efivars.c index f4f709d1370..082f06ecd32 100644 --- a/drivers/firmware/efivars.c +++ b/drivers/firmware/efivars.c @@ -362,7 +362,7 @@ static ssize_t efivar_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops efivar_attr_ops = { +static const struct sysfs_ops efivar_attr_ops = { .show = efivar_attr_show, .store = efivar_attr_store, }; diff --git a/drivers/firmware/iscsi_ibft.c b/drivers/firmware/iscsi_ibft.c index a3600e3ed0f..ed2801c378d 100644 --- a/drivers/firmware/iscsi_ibft.c +++ b/drivers/firmware/iscsi_ibft.c @@ -519,7 +519,7 @@ static ssize_t ibft_show_attribute(struct kobject *kobj, return ret; } -static struct sysfs_ops ibft_attr_ops = { +static const struct sysfs_ops ibft_attr_ops = { .show = ibft_show_attribute, }; diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 20f645743ea..d59f7cad226 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -74,7 +74,7 @@ static struct attribute *def_attrs[] = { NULL }; -static struct sysfs_ops memmap_attr_ops = { +static const struct sysfs_ops memmap_attr_ops = { .show = memmap_attr_show, }; diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index c7320ce4567..89c38c49066 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -128,7 +128,7 @@ static struct attribute *ttm_bo_global_attrs[] = { NULL }; -static struct sysfs_ops ttm_bo_global_ops = { +static const struct sysfs_ops ttm_bo_global_ops = { .show = &ttm_bo_global_show }; diff --git a/drivers/gpu/drm/ttm/ttm_memory.c b/drivers/gpu/drm/ttm/ttm_memory.c index f5245c02b8f..eb143e04d40 100644 --- a/drivers/gpu/drm/ttm/ttm_memory.c +++ b/drivers/gpu/drm/ttm/ttm_memory.c @@ -152,7 +152,7 @@ static struct attribute *ttm_mem_zone_attrs[] = { NULL }; -static struct sysfs_ops ttm_mem_zone_ops = { +static const struct sysfs_ops ttm_mem_zone_ops = { .show = &ttm_mem_zone_show, .store = &ttm_mem_zone_store }; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 5130fc55b8e..764787ebe8d 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3597,7 +3597,7 @@ static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, atomic_long_read(&group->counter[cm_attr->index])); } -static struct sysfs_ops cm_counter_ops = { +static const struct sysfs_ops cm_counter_ops = { .show = cm_show_counter }; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 158a214da2f..1558bb7fc74 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -79,7 +79,7 @@ static ssize_t port_attr_show(struct kobject *kobj, return port_attr->show(p, port_attr, buf); } -static struct sysfs_ops port_sysfs_ops = { +static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c index f91b40942e0..84d2b91e4ef 100644 --- a/drivers/md/dm-sysfs.c +++ b/drivers/md/dm-sysfs.c @@ -75,7 +75,7 @@ static struct attribute *dm_attrs[] = { NULL, }; -static struct sysfs_ops dm_sysfs_ops = { +static const struct sysfs_ops dm_sysfs_ops = { .show = dm_attr_show, }; diff --git a/drivers/md/md.c b/drivers/md/md.c index a20a71e5efd..fdc1890b6ac 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2642,7 +2642,7 @@ static void rdev_free(struct kobject *ko) mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); kfree(rdev); } -static struct sysfs_ops rdev_sysfs_ops = { +static const struct sysfs_ops rdev_sysfs_ops = { .show = rdev_attr_show, .store = rdev_attr_store, }; @@ -4059,7 +4059,7 @@ static void md_free(struct kobject *ko) kfree(mddev); } -static struct sysfs_ops md_sysfs_ops = { +static const struct sysfs_ops md_sysfs_ops = { .show = md_attr_show, .store = md_attr_store, }; diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index f2b93796695..0bc777bac9b 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -1577,7 +1577,7 @@ static struct attribute * veth_pool_attrs[] = { NULL, }; -static struct sysfs_ops veth_pool_ops = { +static const struct sysfs_ops veth_pool_ops = { .show = veth_pool_show, .store = veth_pool_store, }; diff --git a/drivers/net/iseries_veth.c b/drivers/net/iseries_veth.c index 966de5d6952..e6e972d9b7c 100644 --- a/drivers/net/iseries_veth.c +++ b/drivers/net/iseries_veth.c @@ -384,7 +384,7 @@ static struct attribute *veth_cnx_default_attrs[] = { NULL }; -static struct sysfs_ops veth_cnx_sysfs_ops = { +static const struct sysfs_ops veth_cnx_sysfs_ops = { .show = veth_cnx_attribute_show }; @@ -441,7 +441,7 @@ static struct attribute *veth_port_default_attrs[] = { NULL }; -static struct sysfs_ops veth_port_sysfs_ops = { +static const struct sysfs_ops veth_port_sysfs_ops = { .show = veth_port_attribute_show }; diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index 0bc5d474b16..1062b8ffe24 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -481,7 +481,7 @@ pdcspath_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -static struct sysfs_ops pdcspath_attr_ops = { +static const struct sysfs_ops pdcspath_attr_ops = { .show = pdcspath_attr_show, .store = pdcspath_attr_store, }; diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c index 6151389fd90..0a894efd4b9 100644 --- a/drivers/pci/hotplug/fakephp.c +++ b/drivers/pci/hotplug/fakephp.c @@ -73,7 +73,7 @@ static void legacy_release(struct kobject *kobj) } static struct kobj_type legacy_ktype = { - .sysfs_ops = &(struct sysfs_ops){ + .sysfs_ops = &(const struct sysfs_ops){ .store = legacy_store, .show = legacy_show }, .release = &legacy_release, diff --git a/drivers/pci/slot.c b/drivers/pci/slot.c index 49c9e6c9779..f75a44d37fb 100644 --- a/drivers/pci/slot.c +++ b/drivers/pci/slot.c @@ -29,7 +29,7 @@ static ssize_t pci_slot_attr_store(struct kobject *kobj, return attribute->store ? attribute->store(slot, buf, len) : -EIO; } -static struct sysfs_ops pci_slot_sysfs_ops = { +static const struct sysfs_ops pci_slot_sysfs_ops = { .show = pci_slot_attr_show, .store = pci_slot_attr_store, }; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index e941367dd28..4de382acd8f 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -129,7 +129,7 @@ static ssize_t map_type_show(struct kobject *kobj, struct attribute *attr, return entry->show(mem, buf); } -static struct sysfs_ops map_sysfs_ops = { +static const struct sysfs_ops map_sysfs_ops = { .show = map_type_show, }; @@ -217,7 +217,7 @@ static ssize_t portio_type_show(struct kobject *kobj, struct attribute *attr, return entry->show(port, buf); } -static struct sysfs_ops portio_sysfs_ops = { +static const struct sysfs_ops portio_sysfs_ops = { .show = portio_type_show, }; diff --git a/drivers/uwb/wlp/sysfs.c b/drivers/uwb/wlp/sysfs.c index 0370399ff4b..6627c94cc85 100644 --- a/drivers/uwb/wlp/sysfs.c +++ b/drivers/uwb/wlp/sysfs.c @@ -615,8 +615,7 @@ ssize_t wlp_wss_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -static -struct sysfs_ops wss_sysfs_ops = { +static const struct sysfs_ops wss_sysfs_ops = { .show = wlp_wss_attr_show, .store = wlp_wss_attr_store, }; diff --git a/drivers/video/omap2/dss/manager.c b/drivers/video/omap2/dss/manager.c index 913142d4cab..9acef00c47e 100644 --- a/drivers/video/omap2/dss/manager.c +++ b/drivers/video/omap2/dss/manager.c @@ -341,7 +341,7 @@ static ssize_t manager_attr_store(struct kobject *kobj, struct attribute *attr, return manager_attr->store(manager, buf, size); } -static struct sysfs_ops manager_sysfs_ops = { +static const struct sysfs_ops manager_sysfs_ops = { .show = manager_attr_show, .store = manager_attr_store, }; diff --git a/drivers/video/omap2/dss/overlay.c b/drivers/video/omap2/dss/overlay.c index 0c5bea263ac..aed3f319434 100644 --- a/drivers/video/omap2/dss/overlay.c +++ b/drivers/video/omap2/dss/overlay.c @@ -320,7 +320,7 @@ static ssize_t overlay_attr_store(struct kobject *kobj, struct attribute *attr, return overlay_attr->store(overlay, buf, size); } -static struct sysfs_ops overlay_sysfs_ops = { +static const struct sysfs_ops overlay_sysfs_ops = { .show = overlay_attr_show, .store = overlay_attr_store, }; diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index ae5cb05a1a1..bb71ab2336c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -426,7 +426,7 @@ static ssize_t hyp_sysfs_store(struct kobject *kobj, return 0; } -static struct sysfs_ops hyp_sysfs_ops = { +static const struct sysfs_ops hyp_sysfs_ops = { .show = hyp_sysfs_show, .store = hyp_sysfs_store, }; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a240b6fa81d..4ce16ef702a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj) complete(&root->kobj_unregister); } -static struct sysfs_ops btrfs_super_attr_ops = { +static const struct sysfs_ops btrfs_super_attr_ops = { .show = btrfs_super_attr_show, .store = btrfs_super_attr_store, }; -static struct sysfs_ops btrfs_root_attr_ops = { +static const struct sysfs_ops btrfs_root_attr_ops = { .show = btrfs_root_attr_show, .store = btrfs_root_attr_store, }; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 26a8bd40400..f994a7dfda8 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -148,7 +148,7 @@ static void lockspace_kobj_release(struct kobject *k) kfree(ls); } -static struct sysfs_ops dlm_attr_ops = { +static const struct sysfs_ops dlm_attr_ops = { .show = dlm_attr_show, .store = dlm_attr_store, }; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2b83b96cb2e..ce84a6ed4a4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2358,7 +2358,7 @@ static void ext4_sb_release(struct kobject *kobj) } -static struct sysfs_ops ext4_attr_ops = { +static const struct sysfs_ops ext4_attr_ops = { .show = ext4_attr_show, .store = ext4_attr_store, }; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 543503010ed..419042f7f0b 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -49,7 +49,7 @@ static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, return a->store ? a->store(sdp, buf, len) : len; } -static struct sysfs_ops gfs2_attr_ops = { +static const struct sysfs_ops gfs2_attr_ops = { .show = gfs2_attr_show, .store = gfs2_attr_store, }; diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index b39da877b12..3bb928a2bf7 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -136,7 +136,7 @@ static ssize_t mlog_store(struct kobject *obj, struct attribute *attr, return mlog_mask_store(mlog_attr->mask, buf, count); } -static struct sysfs_ops mlog_attr_ops = { +static const struct sysfs_ops mlog_attr_ops = { .show = mlog_show, .store = mlog_store, }; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 50b725bcc3f..ced2299f1c9 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -53,7 +53,7 @@ struct sysfs_buffer { size_t count; loff_t pos; char * page; - struct sysfs_ops * ops; + const struct sysfs_ops * ops; struct mutex mutex; int needs_read_fill; int event; @@ -75,7 +75,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops * ops = buffer->ops; int ret = 0; ssize_t count; @@ -199,7 +199,7 @@ flush_write_buffer(struct dentry * dentry, struct sysfs_buffer * buffer, size_t { struct sysfs_dirent *attr_sd = dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; - struct sysfs_ops * ops = buffer->ops; + const struct sysfs_ops * ops = buffer->ops; int rc; /* need attr_sd for attr and ops, its parent for kobj */ @@ -335,7 +335,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; struct sysfs_buffer *buffer; - struct sysfs_ops *ops; + const struct sysfs_ops *ops; int error = -EACCES; char *p; diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 57a1eaae909..3950d3c2850 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -106,7 +106,7 @@ extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); struct kobj_type { void (*release)(struct kobject *kobj); - struct sysfs_ops *sysfs_ops; + const struct sysfs_ops *sysfs_ops; struct attribute **default_attrs; }; @@ -132,7 +132,7 @@ struct kobj_attribute { const char *buf, size_t count); }; -extern struct sysfs_ops kobj_sysfs_ops; +extern const struct sysfs_ops kobj_sysfs_ops; /** * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. diff --git a/kernel/params.c b/kernel/params.c index 48370be3c0a..68396d73c83 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -722,7 +722,7 @@ static ssize_t module_attr_store(struct kobject *kobj, return ret; } -static struct sysfs_ops module_sysfs_ops = { +static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; diff --git a/lib/kobject.c b/lib/kobject.c index cecf5a0ef6e..8115eb1bbf4 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -700,7 +700,7 @@ static ssize_t kobj_attr_store(struct kobject *kobj, struct attribute *attr, return ret; } -struct sysfs_ops kobj_sysfs_ops = { +const struct sysfs_ops kobj_sysfs_ops = { .show = kobj_attr_show, .store = kobj_attr_store, }; diff --git a/mm/slub.c b/mm/slub.c index a26753c12dc..a2b8969ba6d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4390,7 +4390,7 @@ static void kmem_cache_release(struct kobject *kobj) kfree(s); } -static struct sysfs_ops slab_sysfs_ops = { +static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, }; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1cf2cef7858..fef0384e3c0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -423,7 +423,7 @@ extern void br_ifinfo_notify(int event, struct net_bridge_port *port); #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ -extern struct sysfs_ops brport_sysfs_ops; +extern const struct sysfs_ops brport_sysfs_ops; extern int br_sysfs_addif(struct net_bridge_port *p); /* br_sysfs_br.c */ diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 696596cd338..0b9916489d6 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -238,7 +238,7 @@ static ssize_t brport_store(struct kobject * kobj, return ret; } -struct sysfs_ops brport_sysfs_ops = { +const struct sysfs_ops brport_sysfs_ops = { .show = brport_show, .store = brport_store, }; diff --git a/samples/kobject/kset-example.c b/samples/kobject/kset-example.c index 7c608814052..3b126d1f859 100644 --- a/samples/kobject/kset-example.c +++ b/samples/kobject/kset-example.c @@ -87,7 +87,7 @@ static ssize_t foo_attr_store(struct kobject *kobj, } /* Our custom sysfs_ops that we will associate with our ktype later on */ -static struct sysfs_ops foo_sysfs_ops = { +static const struct sysfs_ops foo_sysfs_ops = { .show = foo_attr_show, .store = foo_attr_store, }; -- cgit v1.2.3-70-g09d2 From a07e4156a2ee6359d31a44946d7ee7f85dbf6bca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 11 Feb 2010 15:23:05 -0800 Subject: sysfs: Use sysfs_attr_init and sysfs_bin_attr_init on dynamic attributes These are the non-static sysfs attributes that exist on my test machine. Fix them to use sysfs_attr_init or sysfs_bin_attr_init as appropriate. It simply requires making a sysfs attribute present to see this. So this is a little bit tedious but otherwise not too bad. Signed-off-by: Eric W. Biederman Acked-by: WANG Cong Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce.c | 1 + drivers/acpi/system.c | 2 ++ drivers/pci/pci-sysfs.c | 5 +++++ kernel/params.c | 1 + 4 files changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a8aacd4b513..28cba46bf32 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -2044,6 +2044,7 @@ static __init void mce_init_banks(void) struct mce_bank *b = &mce_banks[i]; struct sysdev_attribute *a = &b->attr; + sysfs_attr_init(&a->attr); a->attr.name = b->attrname; snprintf(b->attrname, ATTR_LEN, "bank%d", i); diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index a206a12da78..743f2445e2a 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -101,6 +101,7 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr, struct acpi_table_header *header = NULL; struct acpi_table_attr *attr = NULL; + sysfs_attr_init(&table_attr->attr.attr); if (table_header->signature[0] != '\0') memcpy(table_attr->name, table_header->signature, ACPI_NAME_SIZE); @@ -475,6 +476,7 @@ void acpi_irq_stats_init(void) goto fail; strncpy(name, buffer, strlen(buffer) + 1); + sysfs_attr_init(&counter_attrs[i].attr); counter_attrs[i].attr.name = name; counter_attrs[i].attr.mode = 0644; counter_attrs[i].show = counter_show; diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 807224ec835..9fa183cfb0e 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -642,6 +642,7 @@ void pci_create_legacy_files(struct pci_bus *b) if (!b->legacy_io) goto kzalloc_err; + sysfs_bin_attr_init(&b->legacy_io); b->legacy_io->attr.name = "legacy_io"; b->legacy_io->size = 0xffff; b->legacy_io->attr.mode = S_IRUSR | S_IWUSR; @@ -654,6 +655,7 @@ void pci_create_legacy_files(struct pci_bus *b) goto legacy_io_err; /* Allocated above after the legacy_io struct */ + sysfs_bin_attr_init(&b->legacy_mem); b->legacy_mem = b->legacy_io + 1; b->legacy_mem->attr.name = "legacy_mem"; b->legacy_mem->size = 1024*1024; @@ -800,6 +802,7 @@ static int pci_create_attr(struct pci_dev *pdev, int num, int write_combine) if (res_attr) { char *res_attr_name = (char *)(res_attr + 1); + sysfs_bin_attr_init(res_attr); if (write_combine) { pdev->res_attr_wc[num] = res_attr; sprintf(res_attr_name, "resource%d_wc", num); @@ -972,6 +975,7 @@ static int pci_create_capabilities_sysfs(struct pci_dev *dev) if (!attr) return -ENOMEM; + sysfs_bin_attr_init(attr); attr->size = dev->vpd->len; attr->attr.name = "vpd"; attr->attr.mode = S_IRUSR | S_IWUSR; @@ -1038,6 +1042,7 @@ int __must_check pci_create_sysfs_dev_files (struct pci_dev *pdev) retval = -ENOMEM; goto err_resource_files; } + sysfs_bin_attr_init(attr); attr->size = rom_size; attr->attr.name = "rom"; attr->attr.mode = S_IRUSR; diff --git a/kernel/params.c b/kernel/params.c index 68396d73c83..d55a53ec923 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -516,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, new->grp.attrs = attrs; /* Tack new one on the end. */ + sysfs_attr_init(&new->attrs[num].mattr.attr); new->attrs[num].param = kp; new->attrs[num].mattr.show = param_attr_show; new->attrs[num].mattr.store = param_attr_store; -- cgit v1.2.3-70-g09d2