From 41b9eb264c8407655db57b60b4457fe1b2ec9977 Mon Sep 17 00:00:00 2001 From: Stefan Assmann Date: Tue, 15 Jul 2008 13:48:55 +0200 Subject: x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Boot IRQ quirks for Broadcom and AMD/ATI) This is against linux-2.6-tip, branch pci-ioapic-boot-irq-quirks. From: Stefan Assmann Subject: Introduce config option for pci reroute quirks The config option X86_REROUTE_FOR_BROKEN_BOOT_IRQS is introduced to enable (or disable) the redirection of the interrupt handler to the boot interrupt line by default. Depending on the existence of interrupt masking / threaded interrupt handling in the kernel (vanilla, rt, ...) and the maturity of the rerouting patch, users can enable or disable the redirection by default. This means that the reroute quirk can be applied to any kernel without changing it. Interrupt sharing could be increased if this option is enabled. However this option is vital for threaded interrupt handling, as done by the RT kernel. It should simplify the consolidation with the RT kernel. The option can be overridden by either pci=ioapicreroute or pci=noioapicreroute. Signed-off-by: Stefan Assmann Signed-off-by: Olaf Dabrunz Cc: Jesse Barnes Cc: Jon Masters Cc: Ihno Krumreich Cc: Sven Dietrich Cc: Daniel Gollub Cc: Felix Foerster Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96e0c2ebc38..09521332636 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -665,6 +665,30 @@ config X86_VISWS_APIC def_bool y depends on X86_32 && X86_VISWS +config X86_REROUTE_FOR_BROKEN_BOOT_IRQS + bool "Reroute for broken boot IRQs" + default n + depends on X86_IO_APIC + help + This option enables a workaround that fixes a source of + spurious interrupts. This is recommended when threaded + interrupt handling is used on systems where the generation of + superfluous "boot interrupts" cannot be disabled. + + Some chipsets generate a legacy INTx "boot IRQ" when the IRQ + entry in the chipset's IO-APIC is masked (as, e.g. the RT + kernel does during interrupt handling). On chipsets where this + boot IRQ generation cannot be disabled, this workaround keeps + the original IRQ line masked so that only the equivalent "boot + IRQ" is delivered to the CPUs. The workaround also tells the + kernel to set up the IRQ handler on the boot IRQ line. In this + way only one interrupt is delivered to the kernel. Otherwise + the spurious second interrupt may cause the kernel to bring + down (vital) interrupt lines. + + Only affects "broken" chipsets. Interrupt sharing may be + increased on these systems. + config X86_MCE bool "Machine Check Exception" depends on !X86_VOYAGER -- cgit v1.2.3-70-g09d2 From 600715dcdf567c86f8b2c6173fcfb4b873e25a19 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:45 -0700 Subject: generic: add phys_addr_t for holding physical addresses Add a kernel-wide "phys_addr_t" which is guaranteed to be able to hold any physical address. By default it equals the word size of the architecture, but a 32-bit architecture can set ARCH_PHYS_ADDR_T_64BIT if it needs a 64-bit phys_addr_t. 
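[ Illustration, not part of this patch: once phys_addr_t can be wider than unsigned long (e.g. x86-32 with PAE), code must not assume a physical address fits the machine word. A minimal sketch of the portable printing idiom, assuming a kernel with this patch applied; virt_to_phys() and the buffer "buf" are just stand-ins here, and the explicit widening cast is the point:

	/*
	 * phys_addr_t may be u32 or u64 depending on
	 * CONFIG_PHYS_ADDR_T_64BIT, so widen it explicitly
	 * before handing it to a fixed-width printk format.
	 */
	phys_addr_t pa = virt_to_phys(buf);
	printk(KERN_DEBUG "buf at phys %#llx\n",
	       (unsigned long long)pa);
]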
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/powerpc/Kconfig | 3 +++ arch/powerpc/include/asm/types.h | 7 ------- arch/x86/Kconfig | 3 +++ include/asm-x86/page_32.h | 2 -- include/asm-x86/page_64.h | 1 - include/linux/types.h | 6 ++++++ mm/Kconfig | 3 +++ 7 files changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 587da5e0990..f5f83ee6041 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -22,6 +22,9 @@ config WORD_SIZE config PPC_MERGE def_bool y +config ARCH_PHYS_ADDR_T_64BIT + def_bool PPC64 || PHYS_64BIT + config MMU bool default y diff --git a/arch/powerpc/include/asm/types.h b/arch/powerpc/include/asm/types.h index d3374bc865b..c646f34c4e8 100644 --- a/arch/powerpc/include/asm/types.h +++ b/arch/powerpc/include/asm/types.h @@ -48,13 +48,6 @@ typedef struct { typedef __vector128 vector128; -/* Physical address used by some IO functions */ -#if defined(CONFIG_PPC64) || defined(CONFIG_PHYS_64BIT) -typedef u64 phys_addr_t; -#else -typedef u32 phys_addr_t; -#endif - #ifdef __powerpc64__ typedef u64 dma_addr_t; #else diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..a0ffb5188c8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -932,6 +932,9 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config ARCH_PHYS_ADDR_T_64BIT + def_bool X86_64 || X86_PAE + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/include/asm-x86/page_32.h b/include/asm-x86/page_32.h index ab8528793f0..d0e58605a52 100644 --- a/include/asm-x86/page_32.h +++ b/include/asm-x86/page_32.h @@ -33,7 +33,6 @@ typedef u64 pmdval_t; typedef u64 pudval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; -typedef u64 phys_addr_t; typedef union { struct { @@ -54,7 +53,6 @@ typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; -typedef unsigned long phys_addr_t; typedef union { pteval_t pte; diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index c6916c83e6b..2456fbf2d7d 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -79,7 +79,6 @@ typedef unsigned long pmdval_t; typedef unsigned long pudval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; -typedef unsigned long phys_addr_t; typedef struct page *pgtable_t; diff --git a/include/linux/types.h b/include/linux/types.h index d4a9ce6e276..022c668496d 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -197,6 +197,12 @@ typedef u64 resource_size_t; typedef u32 resource_size_t; #endif +#ifdef CONFIG_PHYS_ADDR_T_64BIT +typedef u64 phys_addr_t; +#else +typedef u32 phys_addr_t; +#endif + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; diff --git a/mm/Kconfig b/mm/Kconfig index 0bd9c2dbb2a..91ee3922510 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -187,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. 
+config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA -- cgit v1.2.3-70-g09d2 From 8308c54d7e312f7a03e2ce2057d0837e6fe3843f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:50 -0700 Subject: generic: redefine resource_size_t as phys_addr_t There's no good reason why a resource_size_t shouldn't just be a physical address, so simply redefine it in terms of phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/powerpc/platforms/Kconfig.cputype | 1 - arch/powerpc/sysdev/ppc4xx_pci.c | 16 ++++++---------- arch/x86/Kconfig | 1 - arch/x86/kernel/e820.c | 4 +--- drivers/pci/setup-bus.c | 9 ++++----- include/linux/types.h | 8 ++------ 6 files changed, 13 insertions(+), 26 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 7f651273386..be852fd407a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -135,7 +135,6 @@ config PTE_64BIT config PHYS_64BIT bool 'Large physical address support' if E500 depends on 44x || E500 - select RESOURCES_64BIT default y if 44x ---help--- This option enables kernel support for larger than 32-bit physical diff --git a/arch/powerpc/sysdev/ppc4xx_pci.c b/arch/powerpc/sysdev/ppc4xx_pci.c index fb368dfde5d..e8a76d9539d 100644 --- a/arch/powerpc/sysdev/ppc4xx_pci.c +++ b/arch/powerpc/sysdev/ppc4xx_pci.c @@ -41,13 +41,10 @@ extern unsigned long total_memory; #define U64_TO_U32_LOW(val) ((u32)((val) & 0x00000000ffffffffULL)) #define U64_TO_U32_HIGH(val) ((u32)((val) >> 32)) -#ifdef CONFIG_RESOURCES_64BIT -#define RES_TO_U32_LOW(val) U64_TO_U32_LOW(val) -#define RES_TO_U32_HIGH(val) U64_TO_U32_HIGH(val) -#else -#define RES_TO_U32_LOW(val) (val) -#define RES_TO_U32_HIGH(val) (0) -#endif +#define RES_TO_U32_LOW(val) \ + ((sizeof(resource_size_t) > sizeof(u32)) ? U64_TO_U32_LOW(val) : (val)) +#define RES_TO_U32_HIGH(val) \ + ((sizeof(resource_size_t) > sizeof(u32)) ? U64_TO_U32_HIGH(val) : (0)) static inline int ppc440spe_revA(void) { @@ -145,12 +142,11 @@ static int __init ppc4xx_parse_dma_ranges(struct pci_controller *hose, /* Use that */ res->start = pci_addr; -#ifndef CONFIG_RESOURCES_64BIT /* Beware of 32 bits resources */ - if ((pci_addr + size) > 0x100000000ull) + if (sizeof(resource_size_t) == sizeof(u32) && + (pci_addr + size) > 0x100000000ull) res->end = 0xffffffff; else -#endif res->end = res->start + size - 1; break; } diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a0ffb5188c8..b4e1875f986 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -925,7 +925,6 @@ config X86_PAE def_bool n prompt "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G - select RESOURCES_64BIT help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. 
It diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 66e48aa2dd1..477f4bb7e55 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1276,12 +1276,10 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 1aad599816f..f250a90ee45 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -378,11 +378,10 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, unsigned long align = 0; min_align = 0; for (order = 0; order <= max_order; order++) { -#ifdef CONFIG_RESOURCES_64BIT - resource_size_t align1 = 1ULL << (order + 20); -#else - resource_size_t align1 = 1U << (order + 20); -#endif + resource_size_t align1 = 1; + + align1 <<= (order + 20); + if (!align) min_align = align1; else if (ALIGN(align + min_align, min_align) < align1) diff --git a/include/linux/types.h b/include/linux/types.h index 022c668496d..f24f7beb47d 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -191,18 +191,14 @@ typedef __u32 __bitwise __wsum; #ifdef __KERNEL__ typedef unsigned __bitwise__ gfp_t; -#ifdef CONFIG_RESOURCES_64BIT -typedef u64 resource_size_t; -#else -typedef u32 resource_size_t; -#endif - #ifdef CONFIG_PHYS_ADDR_T_64BIT typedef u64 phys_addr_t; #else typedef u32 phys_addr_t; #endif +typedef phys_addr_t resource_size_t; + struct ustat { __kernel_daddr_t f_tfree; __kernel_ino_t f_tinode; -- cgit v1.2.3-70-g09d2 From e4b2b8866121bd06d5f3d9de0f79a04339ffa252 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:11 -0400 Subject: ftrace: enable using mcount recording on x86 Enable the use of the __mcount_loc infrastructure on x86_64 and i386. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc8351f374f..5a34c5427a0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) -- cgit v1.2.3-70-g09d2 From 6da55c3e8da88e8a7cb6452160776ad6706798ad Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:46 -0700 Subject: x86: enable dyn_array support Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/Kconfig | 2 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/vmlinux_32.lds.S | 1 + arch/x86/kernel/vmlinux_64.lds.S | 3 +++ 4 files changed, 7 insertions(+) (limited to 'arch/x86/Kconfig') diff --git a/arch/Kconfig b/arch/Kconfig index 0267babe5eb..c1f9febb404 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -103,3 +103,5 @@ config HAVE_CLK The calls support software clock gating and thus are a key power management tool on many systems. 
+config HAVE_DYN_ARRAY + def_bool n diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f65c2744d57..42f98009d75 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,6 +33,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_DYN_ARRAY config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc..c36007ab394 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -145,6 +145,7 @@ SECTIONS *(.x86_cpu_dev.init) __x86_cpu_dev_end = .; } + DYN_ARRAY_INIT(8) SECURITY_INIT . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405..30973dbac8c 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -173,6 +173,9 @@ SECTIONS *(.x86_cpu_dev.init) } __x86_cpu_dev_end = .; + + DYN_ARRAY_INIT(8) + SECURITY_INIT . = ALIGN(8); -- cgit v1.2.3-70-g09d2 From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:05 -0700 Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead of irq_desc[] add CONFIG_HAVE_SPARSE_IRQ to use a condensed array. Get rid of irq_desc[] array assumptions. Preallocate 32 irq_desc, and irq_desc() will try to get more. ( No change in functionality is expected anywhere, except the odd build failure where we missed a code site or where a crossing commit introduces new irq_desc[] usage. ) v2: according to Eric, change get_irq_desc() to irq_desc() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/Kconfig | 4 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/io_apic_32.c | 46 ++++++++---- arch/x86/kernel/io_apic_64.c | 75 +++++++++++++------- arch/x86/kernel/irq_32.c | 24 ++++--- arch/x86/kernel/irq_64.c | 35 ++++----- arch/x86/kernel/irqinit_64.c | 10 +-- arch/x86/kernel/visws_quirks.c | 30 ++++---- arch/x86/mach-voyager/voyager_smp.c | 4 +- drivers/gpio/gpiolib.c | 2 +- drivers/mfd/asic3.c | 4 +- drivers/mfd/htc-egpio.c | 2 +- drivers/parisc/dino.c | 6 +- drivers/parisc/eisa.c | 4 +- drivers/parisc/gsc.c | 12 ++-- drivers/parisc/iosapic.c | 4 +- drivers/parisc/superio.c | 4 +- drivers/pcmcia/hd64465_ss.c | 12 +++- drivers/xen/events.c | 8 ++- include/linux/irq.h | 32 ++++++--- kernel/irq/autoprobe.c | 10 +-- kernel/irq/chip.c | 32 +++++---- kernel/irq/handle.c | 138 ++++++++++++++++++++++++++++++++---- kernel/irq/manage.c | 35 +++++---- kernel/irq/migration.c | 14 ++-- kernel/irq/proc.c | 36 +++++----- kernel/irq/resend.c | 2 +- kernel/irq/spurious.c | 5 +- 28 files changed, 404 insertions(+), 187 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/Kconfig b/arch/Kconfig index c1f9febb404..b3676224626 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -105,3 +105,7 @@ config HAVE_CLK config HAVE_DYN_ARRAY def_bool n + +config HAVE_SPARSE_IRQ + def_bool n + diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 42f98009d75..1004888e9b1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY + select HAVE_SPARSE_IRQ if X86_64 config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 7f2bcc3dad8..c2160cfdec9 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c
@@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry = irq_2_pin + irq; unsigned int apicid_value; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, cpumask, cpu_online_map); if (cpus_empty(tmp)) @@ -365,7 +366,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = irq_2_pin + entry->next; } - irq_desc[irq].affinity = cpumask; + desc = irq_to_desc(irq); + desc->affinity = cpumask; spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -475,10 +477,12 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; + struct irq_desc *desc; for_each_online_cpu(i) { for (j = 0; j < nr_irqs; j++) { - if (!irq_desc[j].action) + desc = irq_to_desc(j); + if (!desc->action) continue; /* Is it a significant load ? */ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < @@ -505,6 +509,7 @@ static void do_irq_balance(void) unsigned long tmp_cpu_irq; unsigned long imbalance = 0; cpumask_t allowed_mask, target_cpu_mask, tmp; + struct irq_desc *desc; for_each_possible_cpu(i) { int package_index; @@ -515,7 +520,8 @@ static void do_irq_balance(void) for (j = 0; j < nr_irqs; j++) { unsigned long value_now, delta; /* Is this an active IRQ or balancing disabled ? */ - if (!irq_desc[j].action || irq_balancing_disabled(j)) + desc = irq_to_desc(j); + if (!desc->action || irq_balancing_disabled(j)) continue; if (package_index == i) IRQ_DELTA(package_index, j) = 0; @@ -609,7 +615,8 @@ tryanotherirq: selected_irq = -1; for (j = 0; j < nr_irqs; j++) { /* Is this an active IRQ? */ - if (!irq_desc[j].action) + desc = irq_to_desc(j); + if (!desc->action) continue; if (imbalance <= IRQ_DELTA(max_loaded, j)) continue; @@ -682,10 +689,12 @@ static int balanced_irq(void *unused) int i; unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; + struct irq_desc *desc; /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < nr_irqs ; i++) { - irq_desc[i].pending_mask = cpumask_of_cpu(0); + desc = irq_to_desc(i); + desc->pending_mask = cpumask_of_cpu(0); set_pending_irq(i, cpumask_of_cpu(0)); } @@ -1254,13 +1263,16 @@ static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { + struct irq_desc *desc; + + desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { - irq_desc[irq].status |= IRQ_LEVEL; + desc->status |= IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); } else { - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); } @@ -2027,6 +2039,7 @@ static struct irq_chip ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; /* * NOTE! The local APIC isn't very good at handling @@ -2048,9 +2061,11 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. 
*/ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } @@ -2089,7 +2104,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq, int vector) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); set_intr_gate(vector, interrupt[irq]); @@ -2556,6 +2574,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; int vector; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2575,7 +2594,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2649,6 +2669,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2659,7 +2680,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(mask); target_ht_irq(irq, dest); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 93a3ffabfe6..cab5a25d81b 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -361,9 +362,10 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } #endif @@ -933,14 +935,17 @@ static struct irq_chip ir_ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) { + struct irq_desc *desc; + + desc = irq_to_desc(irq); if (trigger) - irq_desc[irq].status |= IRQ_LEVEL; + desc->status |= IRQ_LEVEL; else - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - irq_desc[irq].status |= IRQ_MOVE_PCNTXT; + desc->status |= IRQ_MOVE_PCNTXT; if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -1596,10 +1601,10 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); static void migrate_ioapic_irq(int irq, cpumask_t mask) { struct irq_cfg *cfg = irq_cfg + irq; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; cpumask_t tmp, cleanup_mask; struct irte irte; - int modify_ioapic_rte = desc->status & IRQ_LEVEL; + int modify_ioapic_rte; unsigned int dest; unsigned long flags; @@ -1616,6 +1621,8 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); @@ -1637,12 +1644,13 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cfg->move_in_progress = 0; } - 
irq_desc[irq].affinity = mask; + desc->affinity = mask; } static int migrate_irq_remapped_level(int irq) { int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); mask_IO_APIC_irq(irq); @@ -1658,11 +1666,11 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); + migrate_ioapic_irq(irq, desc->pending_mask); ret = 0; - irq_desc[irq].status &= ~IRQ_MOVE_PENDING; - cpus_clear(irq_desc[irq].pending_mask); + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); unmask: unmask_IO_APIC_irq(irq); @@ -1674,7 +1682,7 @@ static void ir_irq_migration(struct work_struct *work) int irq; for (irq = 0; irq < nr_irqs; irq++) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -1686,8 +1694,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, - irq_desc[irq].pending_mask); + desc->chip->set_affinity(irq, desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1698,9 +1705,11 @@ static void ir_irq_migration(struct work_struct *work) */ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - if (irq_desc[irq].status & IRQ_LEVEL) { - irq_desc[irq].status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; migrate_irq_remapped_level(irq); return; } @@ -1725,7 +1734,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (irq >= nr_irqs) continue; - desc = irq_desc + irq; + desc = irq_to_desc(irq); cfg = irq_cfg + irq; spin_lock(&desc->lock); if (!cfg->move_cleanup_count) @@ -1791,7 +1800,7 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } @@ -1868,6 +1877,7 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; /* * NOTE! The local APIC isn't very good at handling @@ -1889,9 +1899,11 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. 
*/ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } @@ -1926,7 +1938,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2402,6 +2417,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2421,7 +2437,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #ifdef CONFIG_INTR_REMAP @@ -2435,6 +2452,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp, cleanup_mask; struct irte irte; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2469,7 +2487,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif #endif /* CONFIG_SMP */ @@ -2543,7 +2562,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* * irq migration in process context */ @@ -2655,6 +2674,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2674,7 +2694,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2731,6 +2752,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) struct irq_cfg *cfg = irq_cfg + irq; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2743,7 +2765,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4c7ffb32854..ede513be517 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -224,7 +224,7 @@ unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (unlikely((unsigned)irq >= nr_irqs)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -273,15 +273,16 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; + struct irq_desc *desc = irq_to_desc(i); - spin_lock_irqsave(&irq_desc[i].lock, flags); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); #else for_each_online_cpu(j) any_count |= kstat_cpu(j).irqs[i]; #endif - action = irq_desc[i].action; + action = 
desc->action; if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); @@ -291,8 +292,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); @@ -302,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -398,17 +399,20 @@ void fixup_irqs(cpumask_t map) for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; + struct irq_desc *desc; + if (irq == 2) continue; - cpus_and(mask, irq_desc[irq].affinity, map); + desc = irq_to_desc(irq); + cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); + else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e1f0839430d..738eb65a924 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,15 +83,16 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; + struct irq_desc *desc = irq_to_desc(i); - spin_lock_irqsave(&irq_desc[i].lock, flags); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); #else for_each_online_cpu(j) any_count |= kstat_cpu(j).irqs[i]; #endif - action = irq_desc[i].action; + action = desc->action; if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); @@ -101,8 +102,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); @@ -111,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -228,37 +229,39 @@ void fixup_irqs(cpumask_t map) cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + struct irq_desc *desc; if (irq == 2) continue; + desc = irq_to_desc(irq); /* interrupt's are disabled at this point */ - spin_lock(&irq_desc[irq].lock); + spin_lock(&desc->lock); if (!irq_has_action(irq) || - cpus_equal(irq_desc[irq].affinity, map)) { - spin_unlock(&irq_desc[irq].lock); + cpus_equal(desc->affinity, map)) { + spin_unlock(&desc->lock); continue; } - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (cpus_empty(mask)) { break_affinity = 1; mask = map; } - if (irq_desc[irq].chip->mask) - irq_desc[irq].chip->mask(irq); + if (desc->chip->mask) + desc->chip->mask(irq); - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, 
mask); else if (!(warned++)) set_affinity = 0; - if (irq_desc[irq].chip->unmask) - irq_desc[irq].chip->unmask(irq); + if (desc->chip->unmask) + desc->chip->unmask(irq); - spin_unlock(&irq_desc[irq].lock); + spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 165c5d9b0d1..0744b49b4d1 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -143,9 +143,11 @@ void __init init_ISA_irqs(void) init_8259A(0); for (i = 0; i < nr_irqs; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; if (i < 16) { /* @@ -157,7 +159,7 @@ void __init init_ISA_irqs(void) /* * 'high' PCI IRQs filled in on demand */ - irq_desc[i].chip = &no_irq_chip; + desc->chip = &no_irq_chip; } } } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 61a97e616f7..9d85ab38443 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq) static unsigned int startup_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); return 0; @@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq) static void end_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); } @@ -626,7 +628,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_desc + realirq; + desc = irq_to_desc(realirq); /* * handle this 'virtual interrupt' as a Cobalt one now. 
@@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void) int i; for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = 0; + desc->depth = 1; if (i == 0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE1) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_8259) { - irq_desc[i].chip = &piix4_master_irq_type; + desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { - irq_desc[i].chip = &piix4_virtual_irq_type; + desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 199a5f4a873..0f6e8a6523a 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq) * the interrupt off to another CPU */ static void before_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); __u8 cpu = smp_processor_id(); _raw_spin_lock(&vic_irq_lock); @@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq) /* Finish the VIC interrupt: basically mask */ static void after_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); _raw_spin_lock(&vic_irq_lock); { diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index 8d2940517c9..572d372899d 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -1058,7 +1058,7 @@ static void gpiolib_dbg_show(struct seq_file *s, struct gpio_chip *chip) if (!is_out) { int irq = gpio_to_irq(gpio); - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* This races with request_irq(), set_irq_type(), * and set_irq_wake() ... but those are "rare". 
diff --git a/drivers/mfd/asic3.c b/drivers/mfd/asic3.c index ba5aa200827..e4c0db4dc7b 100644 --- a/drivers/mfd/asic3.c +++ b/drivers/mfd/asic3.c @@ -123,7 +123,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) irqnr = asic->irq_base + (ASIC3_GPIOS_PER_BANK * bank) + i; - desc = irq_desc + irqnr; + desc = irq_to_desc(irqnr); desc->handle_irq(irqnr, desc); if (asic->irq_bothedge[bank] & bit) asic3_irq_flip_edge(asic, base, @@ -136,7 +136,7 @@ static void asic3_irq_demux(unsigned int irq, struct irq_desc *desc) for (i = ASIC3_NUM_GPIOS; i < ASIC3_NR_IRQS; i++) { /* They start at bit 4 and go up */ if (status & (1 << (i - ASIC3_NUM_GPIOS + 4))) { - desc = irq_desc + asic->irq_base + i; + desc = irq_to_desc(asic->irq_base + i); desc->handle_irq(asic->irq_base + i, desc); } diff --git a/drivers/mfd/htc-egpio.c b/drivers/mfd/htc-egpio.c index 6be43172dc6..ad3379fcd19 100644 --- a/drivers/mfd/htc-egpio.c +++ b/drivers/mfd/htc-egpio.c @@ -112,7 +112,7 @@ static void egpio_handler(unsigned int irq, struct irq_desc *desc) /* Run irq handler */ pr_debug("got IRQ %d\n", irqpin); irq = ei->irq_start + irqpin; - desc = &irq_desc[irq]; + desc = irq_to_desc(irq); desc->handle_irq(irq, desc); } } diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index fd56128525d..3bc54b30c3a 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -298,7 +298,8 @@ struct pci_port_ops dino_port_ops = { static void dino_disable_irq(unsigned int irq) { - struct dino_device *dino_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct dino_device *dino_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS); DBG(KERN_WARNING "%s(0x%p, %d)\n", __func__, dino_dev, irq); @@ -310,7 +311,8 @@ static void dino_disable_irq(unsigned int irq) static void dino_enable_irq(unsigned int irq) { - struct dino_device *dino_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct dino_device *dino_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, dino_dev->global_irq, DINO_LOCAL_IRQS); u32 tmp; diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c index 771cef59254..7891db50c48 100644 --- a/drivers/parisc/eisa.c +++ b/drivers/parisc/eisa.c @@ -346,10 +346,10 @@ static int __init eisa_probe(struct parisc_device *dev) } /* Reserve IRQ2 */ - irq_desc[2].action = &irq2_action; + irq_to_desc(2)->action = &irq2_action; for (i = 0; i < 16; i++) { - irq_desc[i].chip = &eisa_interrupt_type; + irq_to_desc(i)->chip = &eisa_interrupt_type; } EISA_bus = 1; diff --git a/drivers/parisc/gsc.c b/drivers/parisc/gsc.c index f7d088b897e..e76db9e4d50 100644 --- a/drivers/parisc/gsc.c +++ b/drivers/parisc/gsc.c @@ -108,7 +108,8 @@ int gsc_find_local_irq(unsigned int irq, int *global_irqs, int limit) static void gsc_asic_disable_irq(unsigned int irq) { - struct gsc_asic *irq_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct gsc_asic *irq_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32); u32 imr; @@ -123,7 +124,8 @@ static void gsc_asic_disable_irq(unsigned int irq) static void gsc_asic_enable_irq(unsigned int irq) { - struct gsc_asic *irq_dev = irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + struct gsc_asic *irq_dev = desc->chip_data; int local_irq = gsc_find_local_irq(irq, irq_dev->global_irq, 32); u32 imr; @@ -159,12 +161,14 @@ static struct hw_interrupt_type gsc_asic_interrupt_type = { int gsc_assign_irq(struct 
hw_interrupt_type *type, void *data) { static int irq = GSC_IRQ_BASE; + struct irq_desc *desc; if (irq > GSC_IRQ_MAX) return NO_IRQ; - irq_desc[irq].chip = type; - irq_desc[irq].chip_data = data; + desc = irq_to_desc(irq); + desc->chip = type; + desc->chip_data = data; return irq++; } diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 6fb3f7979f2..7beffcab274 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -619,7 +619,9 @@ iosapic_set_irt_data( struct vector_info *vi, u32 *dp0, u32 *dp1) static struct vector_info *iosapic_get_vector(unsigned int irq) { - return irq_desc[irq].chip_data; + struct irq_desc *desc = irq_to_desc(irq); + + return desc->chip_data; } static void iosapic_disable_irq(unsigned int irq) diff --git a/drivers/parisc/superio.c b/drivers/parisc/superio.c index 1e8d2d17f04..1e93c837514 100644 --- a/drivers/parisc/superio.c +++ b/drivers/parisc/superio.c @@ -363,7 +363,9 @@ int superio_fixup_irq(struct pci_dev *pcidev) #endif for (i = 0; i < 16; i++) { - irq_desc[i].chip = &superio_interrupt_type; + struct irq_desc *desc = irq_to_desc(i); + + desc->chip = &superio_interrupt_type; } /* diff --git a/drivers/pcmcia/hd64465_ss.c b/drivers/pcmcia/hd64465_ss.c index 117dc12ab43..9ef69cdb318 100644 --- a/drivers/pcmcia/hd64465_ss.c +++ b/drivers/pcmcia/hd64465_ss.c @@ -233,15 +233,18 @@ static struct hw_interrupt_type hd64465_ss_irq_type = { */ static void hs_map_irq(hs_socket_t *sp, unsigned int irq) { + struct irq_desc *desc; + DPRINTK("hs_map_irq(sock=%d irq=%d)\n", sp->number, irq); if (irq >= HS_NUM_MAPPED_IRQS) return; + desc = irq_to_desc(irq); hs_mapped_irq[irq].sock = sp; /* insert ourselves as the irq controller */ - hs_mapped_irq[irq].old_handler = irq_desc[irq].chip; - irq_desc[irq].chip = &hd64465_ss_irq_type; + hs_mapped_irq[irq].old_handler = desc->chip; + desc->chip = &hd64465_ss_irq_type; } @@ -250,13 +253,16 @@ static void hs_map_irq(hs_socket_t *sp, unsigned int irq) */ static void hs_unmap_irq(hs_socket_t *sp, unsigned int irq) { + struct irq_desc *desc; + DPRINTK("hs_unmap_irq(sock=%d irq=%d)\n", sp->number, irq); if (irq >= HS_NUM_MAPPED_IRQS) return; + desc = irq_to_desc(irq); /* restore the original irq controller */ - irq_desc[irq].chip = hs_mapped_irq[irq].old_handler; + desc->chip = hs_mapped_irq[irq].old_handler; } /*============================================================*/ diff --git a/drivers/xen/events.c b/drivers/xen/events.c index ed8235187dc..56ace47f24d 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); + irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); #endif __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); @@ -139,8 +139,10 @@ static void init_evtchn_cpu_bindings(void) #ifdef CONFIG_SMP int i; /* By default all event channels notify CPU#0. 
*/ - for (i = 0; i < nr_irqs; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); + for (i = 0; i < nr_irqs; i++) { + struct irq_desc *desc = irq_to_desc(i); + desc->affinity = cpumask_of_cpu(0); + } #endif memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); diff --git a/include/linux/irq.h b/include/linux/irq.h index 5f4b013624d..80b8200f2ad 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -152,6 +152,10 @@ struct irq_chip { * @name: flow handler name for /proc/interrupts output */ struct irq_desc { + unsigned int irq; +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *next; +#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -179,9 +183,9 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; -#ifdef CONFIG_HAVE_DYN_ARRAY -extern struct irq_desc *irq_desc; -#else +extern struct irq_desc *irq_to_desc(unsigned int irq); +#ifndef CONFIG_HAVE_DYN_ARRAY +/* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; #endif @@ -249,7 +253,10 @@ extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) { - return irq_desc[irq].status & IRQ_NO_BALANCING_MASK; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + return desc->status & IRQ_NO_BALANCING_MASK; } /* Handle irq action chains: */ @@ -281,7 +288,7 @@ extern unsigned int __do_IRQ(unsigned int irq); */ static inline void generic_handle_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); #ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ desc->handle_irq(irq, desc); @@ -325,7 +332,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, static inline void __set_irq_handler_unlocked(int irq, irq_flow_handler_t handler) { - irq_desc[irq].handle_irq = handler; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->handle_irq = handler; } /* @@ -359,7 +369,7 @@ extern void destroy_irq(unsigned int irq); /* Test to see if a driver has successfully requested an irq */ static inline int irq_has_action(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); return desc->action != NULL; } @@ -374,10 +384,10 @@ extern int set_irq_chip_data(unsigned int irq, void *data); extern int set_irq_type(unsigned int irq, unsigned int type); extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); -#define get_irq_chip(irq) (irq_desc[irq].chip) -#define get_irq_chip_data(irq) (irq_desc[irq].chip_data) -#define get_irq_data(irq) (irq_desc[irq].handler_data) -#define get_irq_msi(irq) (irq_desc[irq].msi_desc) +#define get_irq_chip(irq) (irq_to_desc(irq)->chip) +#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data) +#define get_irq_data(irq) (irq_to_desc(irq)->handler_data) +#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) #endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index c689e9851a8..c45ab718cf0 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -39,7 +39,7 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. 
*/ for (i = nr_irqs-1; i > 0; i--) { - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -69,7 +69,7 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for (i = nr_irqs-1; i > 0; i--) { - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { @@ -92,7 +92,7 @@ unsigned long probe_irq_on(void) for (i = 0; i < nr_irqs; i++) { unsigned int status; - desc = irq_desc + i; + desc = irq_to_desc(i); spin_lock_irq(&desc->lock); status = desc->status; @@ -131,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val) mask = 0; for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; spin_lock_irq(&desc->lock); @@ -174,7 +174,7 @@ int probe_irq_off(unsigned long val) int i, irq_found = 0, nr_irqs = 0; for (i = 0; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; spin_lock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index bba66e09870..76c225cf4b2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -33,7 +33,7 @@ void dynamic_irq_init(unsigned int irq) } /* Ensure we don't have left over values from a previous use of this irq */ - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status = IRQ_DISABLED; desc->chip = &no_irq_chip; @@ -65,7 +65,7 @@ void dynamic_irq_cleanup(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); if (desc->action) { spin_unlock_irqrestore(&desc->lock, flags); @@ -100,7 +100,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) if (!chip) chip = &no_irq_chip; - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); irq_chip_set_defaults(chip); desc->chip = chip; @@ -126,7 +126,7 @@ int set_irq_type(unsigned int irq, unsigned int type) return -ENODEV; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); if (type == IRQ_TYPE_NONE) return 0; @@ -155,7 +155,7 @@ int set_irq_data(unsigned int irq, void *data) return -EINVAL; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->handler_data = data; spin_unlock_irqrestore(&desc->lock, flags); @@ -180,7 +180,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) "Trying to install msi data for IRQ%d\n", irq); return -EINVAL; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->msi_desc = entry; if (entry) @@ -198,9 +198,10 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) */ int set_irq_chip_data(unsigned int irq, void *data) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; unsigned long flags; + desc = irq_to_desc(irq); if (irq >= nr_irqs || !desc->chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; @@ -219,8 +220,9 @@ EXPORT_SYMBOL(set_irq_chip_data); */ static void default_enable(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; + desc = irq_to_desc(irq); desc->chip->unmask(irq); desc->status &= ~IRQ_MASKED; } @@ -237,7 +239,10 @@ static void default_disable(unsigned int irq) */ static unsigned int default_startup(unsigned int irq) { - irq_desc[irq].chip->enable(irq); + struct irq_desc *desc; + + desc = 
irq_to_desc(irq); + desc->chip->enable(irq); return 0; } @@ -247,8 +252,9 @@ static unsigned int default_startup(unsigned int irq) */ static void default_shutdown(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; + desc = irq_to_desc(irq); desc->chip->mask(irq); desc->status |= IRQ_MASKED; } @@ -551,7 +557,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); if (!handle) handle = handle_bad_irq; @@ -616,7 +622,7 @@ void __init set_irq_noprobe(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_NOPROBE; @@ -634,7 +640,7 @@ void __init set_irq_probe(unsigned int irq) return; } - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); desc->status &= ~IRQ_NOPROBE; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 6ce3bcc2b8f..9fc33b3378e 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,6 +18,14 @@ #include "internals.h" +#ifdef CONFIG_TRACE_IRQFLAGS + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; +#endif + /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -51,7 +59,8 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_desc irq_desc_init __initdata = { +static struct irq_desc irq_desc_init = { + .irq = -1U, .status = IRQ_DISABLED, .chip = &no_irq_chip, .handle_irq = handle_bad_irq, @@ -62,6 +71,27 @@ static struct irq_desc irq_desc_init __initdata = { #endif }; + +static void init_one_irq_desc(struct irq_desc *desc) +{ + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); +#ifdef CONFIG_TRACE_IRQFLAGS + lockdep_set_class(&desc->lock, &irq_desc_lock_class); +#endif +} + +#ifdef CONFIG_HAVE_SPARSE_IRQ +static int nr_irq_desc = 32; + +static int __init parse_nr_irq_desc(char *arg) +{ + if (arg) + nr_irq_desc = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("nr_irq_desc", parse_nr_irq_desc); + static void __init init_work(void *data) { struct dyn_array *da = data; @@ -71,12 +101,83 @@ static void __init init_work(void *data) desc = *da->name; for (i = 0; i < *da->nr; i++) - memcpy(&desc[i], &irq_desc_init, sizeof(struct irq_desc)); + init_one_irq_desc(&desc[i]); + + for (i = 1; i < *da->nr; i++) + desc[i-1].next = &desc[i]; } -struct irq_desc *irq_desc; +static struct irq_desc *sparse_irqs; +DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); + +extern int after_bootmem; +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); +struct irq_desc *irq_to_desc(unsigned int irq) +{ + struct irq_desc *desc, *desc_pri; + int i; + int count = 0; + + BUG_ON(irq == -1U); + + desc_pri = desc = &sparse_irqs[0]; + while (desc) { + if (desc->irq == irq) + return desc; + + if (desc->irq == -1U) { + desc->irq = irq; + return desc; + } + desc_pri = desc; + desc = desc->next; + count++; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + + if (after_bootmem) + desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC); + else + desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0); + + if (!desc) + panic("please boot with nr_irq_desc= %d\n", count * 2); 
+ + for (i = 0; i < nr_irq_desc; i++) + init_one_irq_desc(&desc[i]); + + for (i = 1; i < nr_irq_desc; i++) + desc[i-1].next = &desc[i]; + + desc->irq = irq; + desc_pri->next = desc; + + return desc; +} +#else +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + int i; + struct irq_desc *desc; + + desc = *da->name; + + for (i = 0; i < *da->nr; i++) + init_one_irq_desc(&desc[i]); + +} +static struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); +#endif + #else struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { @@ -85,12 +186,23 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .chip = &no_irq_chip, .handle_irq = handle_bad_irq, .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), + .lock = __SPIN_LOCK_UNLOCKED(sparse_irqs->lock), #ifdef CONFIG_SMP .affinity = CPU_MASK_ALL #endif } }; + +#endif + +#ifndef CONFIG_HAVE_SPARSE_IRQ +struct irq_desc *irq_to_desc(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_desc[irq]; + + return NULL; +} #endif /* @@ -99,7 +211,10 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { */ static void ack_bad(unsigned int irq) { - print_irq_desc(irq, irq_desc + irq); + struct irq_desc *desc; + + desc = irq_to_desc(irq); + print_irq_desc(irq, desc); ack_bad_irq(irq); } @@ -196,7 +311,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) */ unsigned int __do_IRQ(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned int status; @@ -287,19 +402,16 @@ out: } #endif -#ifdef CONFIG_TRACE_IRQFLAGS - -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; +#ifdef CONFIG_TRACE_IRQFLAGS void early_init_irq_lock_class(void) { +#ifndef CONFIG_HAVE_DYN_ARRAY int i; for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); +#endif } - #endif + diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d5a4333d8f1..b5943e9f95a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -31,7 +31,7 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL; */ void synchronize_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned int status; if (irq >= nr_irqs) @@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq); */ int irq_can_set_affinity(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || !desc->chip->set_affinity) @@ -81,7 +81,7 @@ int irq_can_set_affinity(unsigned int irq) */ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (!desc->chip->set_affinity) return -EINVAL; @@ -111,14 +111,16 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) int irq_select_affinity(unsigned int irq) { cpumask_t mask; + struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - irq_desc[irq].affinity = mask; - irq_desc[irq].chip->set_affinity(irq, mask); + desc = irq_to_desc(irq); + desc->affinity = mask; + desc->chip->set_affinity(irq, mask); set_balance_irq_affinity(irq, mask); return 0; @@ -140,7 +142,7 @@ int irq_select_affinity(unsigned int irq) */ void disable_irq_nosync(unsigned int irq) { - struct 
irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; if (irq >= nr_irqs) @@ -169,7 +171,7 @@ EXPORT_SYMBOL(disable_irq_nosync); */ void disable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (irq >= nr_irqs) return; @@ -211,7 +213,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq) */ void enable_irq(unsigned int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; if (irq >= nr_irqs) @@ -225,7 +227,7 @@ EXPORT_SYMBOL(enable_irq); static int set_irq_wake_real(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; if (desc->chip->set_wake) @@ -248,7 +250,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) */ int set_irq_wake(unsigned int irq, unsigned int on) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; int ret = 0; @@ -288,12 +290,13 @@ EXPORT_SYMBOL(set_irq_wake); */ int can_request_irq(unsigned int irq, unsigned long irqflags) { + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; - if (irq >= nr_irqs || irq_desc[irq].status & IRQ_NOREQUEST) + if (irq >= nr_irqs || desc->status & IRQ_NOREQUEST) return 0; - action = irq_desc[irq].action; + action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; @@ -349,7 +352,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, */ int setup_irq(unsigned int irq, struct irqaction *new) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *old, **p; const char *old_name = NULL; unsigned long flags; @@ -518,7 +521,7 @@ void free_irq(unsigned int irq, void *dev_id) if (irq >= nr_irqs) return; - desc = irq_desc + irq; + desc = irq_to_desc(irq); spin_lock_irqsave(&desc->lock, flags); p = &desc->action; for (;;) { @@ -615,6 +618,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, { struct irqaction *action; int retval; + struct irq_desc *desc; #ifdef CONFIG_LOCKDEP /* @@ -632,7 +636,8 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -EINVAL; if (irq >= nr_irqs) return -EINVAL; - if (irq_desc[irq].status & IRQ_NOREQUEST) + desc = irq_to_desc(irq); + if (desc->status & IRQ_NOREQUEST) return -EINVAL; if (!handler) return -EINVAL; diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 77b7acc875c..90b920d3f52 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -3,18 +3,18 @@ void set_pending_irq(unsigned int irq, cpumask_t mask) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); desc->status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + desc->pending_mask = mask; spin_unlock_irqrestore(&desc->lock, flags); } void move_masked_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); cpumask_t tmp; if (likely(!(desc->status & IRQ_MOVE_PENDING))) @@ -30,7 +30,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) + if (unlikely(cpus_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,7 +38,7 @@ void move_masked_irq(int irq) assert_spin_locked(&desc->lock); - cpus_and(tmp, irq_desc[irq].pending_mask, 
cpu_online_map); + cpus_and(tmp, desc->pending_mask, cpu_online_map); /* * If there was a valid mask to work with, please @@ -55,12 +55,12 @@ void move_masked_irq(int irq) if (likely(!cpus_empty(tmp))) { desc->chip->set_affinity(irq,tmp); } - cpus_clear(irq_desc[irq].pending_mask); + cpus_clear(desc->pending_mask); } void move_native_irq(int irq) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index e5225a65a4f..c2f356c808f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { - struct irq_desc *desc = irq_desc + (long)m->private; + struct irq_desc *desc = irq_to_desc((long)m->private); cpumask_t *mask = &desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -43,7 +43,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_t new_value; int err; - if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || irq_balancing_disabled(irq)) return -EIO; @@ -132,20 +132,20 @@ static const struct file_operations default_affinity_proc_fops = { static int irq_spurious_read(char *page, char **start, off_t off, int count, int *eof, void *data) { - struct irq_desc *d = &irq_desc[(long) data]; + struct irq_desc *desc = irq_to_desc((long) data); return sprintf(page, "count %u\n" "unhandled %u\n" "last_unhandled %u ms\n", - d->irq_count, - d->irqs_unhandled, - jiffies_to_msecs(d->last_unhandled)); + desc->irq_count, + desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); } #define MAX_NAMELEN 128 static int name_unique(unsigned int irq, struct irqaction *new_action) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; unsigned long flags; int ret = 1; @@ -165,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) void register_handler_proc(unsigned int irq, struct irqaction *action) { char name [MAX_NAMELEN]; + struct irq_desc *desc = irq_to_desc(irq); - if (!irq_desc[irq].dir || action->dir || !action->name || + if (!desc->dir || action->dir || !action->name || !name_unique(irq, action)) return; @@ -174,7 +175,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) snprintf(name, MAX_NAMELEN, "%s", action->name); /* create /proc/irq/1234/handler/ */ - action->dir = proc_mkdir(name, irq_desc[irq].dir); + action->dir = proc_mkdir(name, desc->dir); } #undef MAX_NAMELEN @@ -185,25 +186,24 @@ void register_irq_proc(unsigned int irq) { char name [MAX_NAMELEN]; struct proc_dir_entry *entry; + struct irq_desc *desc = irq_to_desc(irq); - if (!root_irq_dir || - (irq_desc[irq].chip == &no_irq_chip) || - irq_desc[irq].dir) + if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ - irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); + desc->dir = proc_mkdir(name, root_irq_dir); #ifdef CONFIG_SMP /* create /proc/irq//smp_affinity */ - proc_create_data("smp_affinity", 0600, irq_desc[irq].dir, + proc_create_data("smp_affinity", 0600, desc->dir, &irq_affinity_proc_fops, (void *)(long)irq); #endif - entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); + entry = create_proc_entry("spurious", 0444, desc->dir); if (entry) { entry->data = (void 
*)(long)irq; entry->read_proc = irq_spurious_read; @@ -214,8 +214,10 @@ void register_irq_proc(unsigned int irq) void unregister_handler_proc(unsigned int irq, struct irqaction *action) { - if (action->dir) - remove_proc_entry(action->dir->name, irq_desc[irq].dir); + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + remove_proc_entry(action->dir->name, desc->dir); + } } void register_default_affinity_proc(void) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index cba8aa5bc7f..89c7117acf2 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -36,7 +36,7 @@ static void resend_irqs(unsigned long arg) while (!bitmap_empty(irqs_resend, nr_irqs)) { irq = find_first_bit(irqs_resend, nr_irqs); clear_bit(irq, irqs_resend); - desc = irq_desc + irq; + desc = irq_to_desc(irq); local_irq_disable(); desc->handle_irq(irq, desc); local_irq_enable(); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index e26ca1e90c0..b5d906002e1 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -92,11 +92,12 @@ static int misrouted_irq(int irq) int ok = 0; for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc; if (i == irq) /* Already tried */ continue; + desc = irq_to_desc(i); if (try_one_irq(i, desc)) ok = 1; } @@ -108,7 +109,7 @@ static void poll_spurious_irqs(unsigned long dummy) { int i; for (i = 1; i < nr_irqs; i++) { - struct irq_desc *desc = irq_desc + i; + struct irq_desc *desc = irq_to_desc(i); unsigned int status; /* Racy but it doesn't matter */ -- cgit v1.2.3-70-g09d2 From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:23 -0700 Subject: x86: remove irqbalance in kernel for 32 bit This has been deprecated for years; the user-space irqbalanced utility works better with NUMA and has configurable policies, etc... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 - arch/x86/configs/i386_defconfig | 1 - arch/x86/kernel/io_apic_32.c | 402 ---------------------------------------- arch/x86/kernel/quirks.c | 3 - include/linux/irq.h | 14 +- kernel/irq/manage.c | 3 - 6 files changed, 3 insertions(+), 428 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1004888e9b1..3e0eaaa1a33 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1254,14 +1254,6 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. -config IRQBALANCE - def_bool y - prompt "Enable kernel irq balancing" - depends on X86_32 && SMP && X86_IO_APIC - help - The default yes will allow the kernel to do irq load balancing. - Saying no will keep the kernel from doing irq load balancing.
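The replacement policy lives entirely in user space: a balancer daemon steers interrupts through the per-IRQ proc interface this series keeps, since register_irq_proc() above creates /proc/irq/<n>/smp_affinity, which accepts a hexadecimal CPU mask. A minimal sketch of that user-space side (illustrative only; the irq number and mask below are example values, not from a real system):

#include <stdio.h>

/* Write a hex cpumask to /proc/irq/<irq>/smp_affinity; returns 0 on success. */
static int set_irq_affinity(unsigned int irq, unsigned long mask)
{
	char path[64];
	FILE *f;

	snprintf(path, sizeof(path), "/proc/irq/%u/smp_affinity", irq);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%lx\n", mask);	/* e.g. 0x2 pins the irq to CPU1 */
	return fclose(f) ? -1 : 0;
}

Moving irq 16 to CPU1 is then set_irq_affinity(16, 0x2), the same effect the removed kernel thread achieved internally via set_pending_irq().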
- config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 52d0359719d..13b8c86ae98 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -287,7 +287,6 @@ CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set CONFIG_X86_PAT=y CONFIG_EFI=y -# CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 204884b1415..668edf22606 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -371,408 +371,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_IRQBALANCE) -# include /* kernel_thread() */ -# include /* kstat */ -# include /* kmalloc() */ -# include - -#define IRQBALANCE_CHECK_ARCH -999 -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -static int physical_balance __read_mostly; -static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; - -static struct irq_cpu_info { - unsigned long *last_irq; - unsigned long *irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) - -static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL; - -static cpumask_t *balance_irq_affinity; - - -static void __init irq_affinity_init_work(void *data) -{ - struct dyn_array *da = data; - - int i; - struct balance_irq_affinity *affinity; - - affinity = *da->name; - - for (i = 0; i < *da->nr; i++) - memcpy(&affinity[i], &balance_irq_affinity_init, - sizeof(struct balance_irq_affinity)); - -} - -DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work); - - -void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ - balance_irq_affinity[irq] = mask; -} - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu, now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) - set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - struct irq_desc *desc; - - for_each_online_cpu(i) 
{ - for (j = 0; j < nr_irqs; j++) { - desc = irq_to_desc(j); - if (!desc->action) - continue; - /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - struct irq_desc *desc; - - for_each_possible_cpu(i) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < nr_irqs; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ or balancing disabled ? */ - desc = irq_to_desc(j); - if (!desc->action || irq_balancing_disabled(j)) - continue; - if (package_index == i) - IRQ_DELTA(package_index, j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_irqs_cpu(j, i); - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i, j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i, j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index, j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* - * Look for heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* - * In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original - * approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* - * if imbalance is less than approx 10% of max load, then - * observe diminishing returns action. - quit - */ - if (imbalance < (max_cpu_irq >> 3)) - goto not_worth_the_effort; - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. 
- */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < nr_irqs; j++) { - /* Is this an active IRQ? */ - desc = irq_to_desc(j); - if (!desc->action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded, j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded, j)) { - move_this_load = IRQ_DELTA(max_loaded, j); - selected_irq = j; - } - } - if (selected_irq == -1) - goto tryanothercpu; - - imbalance = move_this_load; - - /* For physical_balance case, we accumulated both load - * values in the one of the siblings cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */ - load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, - cpu_online_map, - balance_irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - /* mark for change destination */ - set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - struct irq_desc *desc; - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < nr_irqs ; i++) { - desc = irq_to_desc(i); - desc->pending_mask = cpumask_of_cpu(0); - set_pending_irq(i, cpumask_of_cpu(0)); - } - - set_freezable(); - for ( ; ; ) { - time_remaining = schedule_timeout_interruptible(time_remaining); - try_to_freeze(); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - preempt_enable(); - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overwritten by the command line ask subarchitecture. 
*/ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory"); - goto failed; - } - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) - return 0; - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -failed: - for_each_possible_cpu(i) { - kfree(irq_cpu_data[i].irq_delta); - irq_cpu_data[i].irq_delta = NULL; - kfree(irq_cpu_data[i].last_irq); - irq_cpu_data[i].last_irq = NULL; - } - return 0; -} - -int __devinit irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -late_initcall(balanced_irq_init); -#endif /* CONFIG_IRQBALANCE */ #endif /* CONFIG_SMP */ #ifndef CONFIG_SMP diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index f6a11b9b1f9..67465ed8931 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " "disabling irq balancing and affinity\n"); -#ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); -#endif noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; diff --git a/include/linux/irq.h b/include/linux/irq.h index 704136138dc..2445d2b3d5d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -185,7 +185,7 @@ struct irq_desc { cpumask_t affinity; unsigned int cpu; #endif -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_t pending_mask; #endif #ifdef CONFIG_PROC_FS @@ -241,13 +241,13 @@ extern int setup_irq(unsigned int irq, struct irqaction *new); #ifdef CONFIG_SMP -#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) +#ifdef CONFIG_GENERIC_PENDING_IRQ void set_pending_irq(unsigned int irq, cpumask_t mask); void move_native_irq(int irq); void move_masked_irq(int irq); -#else /* CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE */ +#else /* CONFIG_GENERIC_PENDING_IRQ */ static inline void move_irq(int irq) { @@ -274,14 +274,6 @@ static inline void set_pending_irq(unsigned int irq, cpumask_t mask) #endif /* CONFIG_SMP */ -#ifdef CONFIG_IRQBALANCE -extern void set_balance_irq_affinity(unsigned int irq, cpumask_t mask); -#else -static inline void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ -} -#endif - extern int no_irq_affinity; static inline int irq_balancing_disabled(unsigned int irq) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 6df49218632..ddc956861a5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -86,8 +86,6 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) if (!desc->chip->set_affinity) return -EINVAL; - set_balance_irq_affinity(irq, cpumask); - #ifdef CONFIG_GENERIC_PENDING_IRQ if 
(desc->status & IRQ_MOVE_PCNTXT) { unsigned long flags; @@ -122,7 +120,6 @@ int irq_select_affinity(unsigned int irq) desc->affinity = mask; desc->chip->set_affinity(irq, mask); - set_balance_irq_affinity(irq, mask); return 0; } #endif -- cgit v1.2.3-70-g09d2 From 199751d715bba5b469ea22adadc68a4166bfa4f5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:27 -0700 Subject: x86: make 32 bit use sparse_irq The irq number must still be less than NR_IRQS, because entry.S indexes the interrupt[NR_IRQS] table; removing that limit still requires enabling per_cpu vectors... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/io_apic_32.c | 63 ++++++++++++++++++++++++++++++-------------- arch/x86/kernel/irq_32.c | 37 +++++++++++++++++++------- arch/x86/kernel/irqinit_32.c | 7 +++++ 4 files changed, 79 insertions(+), 30 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3e0eaaa1a33..b157e94637b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY - select HAVE_SPARSE_IRQ if X86_64 + select HAVE_SPARSE_IRQ config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 59d2e8a273e..66c0a91362a 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -595,7 +595,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry; unsigned int apicid_value; cpumask_t tmp; - struct irq_desc *desc; cfg = irq_cfg(irq); @@ -620,8 +619,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = entry->next; } - desc = irq_to_desc(irq); - desc->affinity = cpumask; + irq_to_desc(irq)->affinity = cpumask; spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1055,8 +1053,6 @@ static int __assign_irq_vector(int irq) int vector, offset; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= nr_irqs); - cfg = irq_cfg(irq); if (cfg->vector > 0) return cfg->vector; @@ -1103,7 +1099,12 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { struct irq_desc *desc; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { desc->status |= IRQ_LEVEL; @@ -2330,16 +2331,19 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq, new, vector = 0; + unsigned int irq, new, vector = 0; unsigned long flags; struct irq_cfg *cfg_new; - irq = -ENOSPC; + /* only can use bus/dev/fn..
when per_cpu vector is used */ + irq_want = nr_irqs - 1; + + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2354,13 +2358,18 @@ int create_irq(void) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { set_intr_gate(vector, interrupt[irq]); dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + return create_irq_nr(nr_irqs - 1); +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2415,7 +2424,6 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; int vector; - struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2435,8 +2443,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + irq_to_desc(irq)->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2455,13 +2462,31 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_msg msg; int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; + + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + + if (irq == 0) + return -1; ret = msi_compose_msg(dev, irq, &msg); if (ret < 0) { @@ -2510,7 +2535,6 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { unsigned int dest; cpumask_t tmp; - struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2521,8 +2545,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(mask); target_ht_irq(irq, dest); - desc = irq_to_desc(irq); - desc->affinity = mask; + irq_to_desc(irq)->affinity = mask; } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 576c5df6cad..0a57e39159a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -224,9 +224,10 @@ unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (unlikely((unsigned)irq >= nr_irqs)) { + desc = irq_to_desc(irq); + if (unlikely(!desc)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __func__, irq); BUG(); @@ -263,6 +264,24 @@ int show_interrupts(struct seq_file *p, void *v) int i = *(loff_t *) v, j; struct irqaction * action; unsigned long flags; + unsigned int entries; + struct irq_desc *desc; + int tail = 0; + +#ifdef CONFIG_HAVE_SPARSE_IRQ + desc = (struct irq_desc *)v; + entries = -1U; + i = desc->irq; + if (!desc->next) + tail = 1; +#else + entries = nr_irqs - 1; + i = *(loff_t *) v; + if (i == nr_irqs) + tail = 1; + else + desc = irq_to_desc(i); +#endif if (i == 0) { seq_printf(p, " "); @@ -271,9 +290,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < nr_irqs) { + if (i <= entries) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP @@ -285,7 +303,7 @@ int show_interrupts(struct seq_file *p, 
void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%3d: ",i); + seq_printf(p, "%#x: ",i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else @@ -304,7 +322,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == nr_irqs) { + } + + if (tail) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); @@ -396,15 +416,14 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; - struct irq_desc *desc; if (irq == 2) continue; - desc = irq_to_desc(irq); cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 65c1c950770..ded09ac2642 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -69,6 +69,13 @@ void __init init_ISA_irqs (void) * 16 old-style INTA-cycle interrupts: */ for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc_alloc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); } -- cgit v1.2.3-70-g09d2 From 8f09cd20a24c5f13c571bc73ddcd47be0af3b70f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:51 -0700 Subject: x86: make HAVE_SPARSE_IRQ support selectable Ingo noted that sparse_irq is somewhat intrusive, so make it selectable to keep things simple. Also remove irq_desc as a parameter in some functions (ack, eoi, set_affinity). The members of irq_chip may need to take an irq_desc, or a struct irq, later. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/Kconfig | 3 -- arch/x86/Kconfig | 12 +++++++- arch/x86/kernel/io_apic.c | 71 ++++++++++++++++++++++++++++++--------------- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 5 files changed, 62 insertions(+), 28 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/Kconfig b/arch/Kconfig index b3676224626..c8a7c2eb649 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -106,6 +106,3 @@ config HAVE_CLK config HAVE_DYN_ARRAY def_bool n -config HAVE_SPARSE_IRQ - def_bool n - diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b157e94637b..600584b7a49 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,6 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY - select HAVE_SPARSE_IRQ config ARCH_DEFCONFIG string @@ -238,6 +237,17 @@ config SMP If you don't know what to do here, say N. +config HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + default y + help + This enables support for sparse irq, esp for msi/msi-x. the irq + number will be bus/dev/fn + 12bit. You may need if you have lots of + cards supports msi-x installed. + + If you don't know what to do here, say Y.
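The "bus/dev/fn + 12bit" numbering the help text describes is the scheme introduced with build_irq_for_pci_dev() in the previous patch: the PCI bus number and devfn are packed above a low 12-bit field. A small sketch of the encoding (the bus/devfn values below are illustrative, not from real hardware):

/* Mirrors build_irq_for_pci_dev() from the earlier io_apic_32.c hunk. */
static unsigned int encode_sparse_irq(unsigned int bus, unsigned int devfn)
{
	unsigned int irq = bus;

	irq <<= 8;	/* make room for the 8-bit devfn */
	irq |= devfn;
	irq <<= 12;	/* low 12 bits are left free */
	return irq;
}

For bus 0x02, device 1, function 0 (devfn 0x08) this yields ((0x02 << 8) | 0x08) << 12 = 0x208000, and arch_setup_msi_irq() then asks create_irq_nr() for irq_want = 0x208000 + 0x100.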
+ config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bf0e66d7303..f853b667fa5 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -107,7 +107,9 @@ struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; +#ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_cfg *next; +#endif struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -137,20 +139,6 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = { }; static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); static void init_one_irq_cfg(struct irq_cfg *cfg) { @@ -158,7 +146,9 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) } static struct irq_cfg *irq_cfgx; +#ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; +#endif static void __init init_work(void *data) { struct dyn_array *da = data; @@ -174,15 +164,34 @@ static void __init init_work(void *data) for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); +#ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) cfg[i-1].next = &cfg[i]; irq_cfgx_free = &irq_cfgx[legacy_count]; irq_cfgx[legacy_count - 1].next = NULL; +#endif +} + +#ifdef CONFIG_HAVE_SPARSE_IRQ +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; } -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +#define for_each_irq_cfg(irqX, cfg) \ + for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? 
cfg->irq : -1U) + DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -273,7 +282,26 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) #endif return cfg; } +#else + +#define for_each_irq_cfg(irq, cfg) \ + for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +struct irq_cfg *irq_cfg(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_cfgx[irq]; + + return NULL; +} +struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + return irq_cfg(irq); +} + +#endif /* * This is performance-critical, we want to do it O(1) * @@ -1282,11 +1310,10 @@ void __setup_vector_irq(int cpu) struct irq_cfg *cfg; /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { + for_each_irq_cfg(irq, cfg) { if (!cpu_isset(cpu, cfg->domain)) continue; vector = cfg->vector; - irq = cfg->irq; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -1563,6 +1590,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + unsigned int irq; if (apic_verbosity == APIC_QUIET) return; @@ -1651,11 +1679,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { + for_each_irq_cfg(irq, cfg) { struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); + printk(KERN_DEBUG "IRQ%d ", irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -2535,8 +2563,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; + for_each_irq_cfg(irq, cfg) { if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index cc929f2f84f..b2e1082cf5a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -269,7 +269,7 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction * action; unsigned long flags; unsigned int entries; - struct irq_desc *desc; + struct irq_desc *desc = NULL; int tail = 0; #ifdef CONFIG_HAVE_SPARSE_IRQ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 348a11168c2..c2fca89a3f7 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -74,7 +74,7 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction * action; unsigned long flags; unsigned int entries; - struct irq_desc *desc; + struct irq_desc *desc = NULL; int tail = 0; #ifdef CONFIG_HAVE_SPARSE_IRQ -- cgit v1.2.3-70-g09d2 From 3235e936c0cc3589309280b6f59e5096779adae3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 13:16:00 +0200 Subject: x86: remove sparse irq from Kconfig This code is not ready yet. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 600584b7a49..8636ddf2f4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -237,17 +237,6 @@ config SMP If you don't know what to do here, say N. -config HAVE_SPARSE_IRQ - bool "Support sparse irq numbering" - depends on PCI_MSI || HT_IRQ - default y - help - This enables support for sparse irq, esp for msi/msi-x. the irq - number will be bus/dev/fn + 12bit. 
You may need if you have lots of - cards supports msi-x installed. - - If you don't know what to do here, say Y. - config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -- cgit v1.2.3-70-g09d2 From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 15:27:23 +0200 Subject: genirq: revert dynarray Revert the dynarray changes. They need more thought and polishing. Signed-off-by: Thomas Gleixner --- arch/Kconfig | 4 - arch/x86/Kconfig | 1 - arch/x86/kernel/io_apic.c | 199 ++++++++++++++------------------------ arch/x86/kernel/setup_percpu.c | 8 +- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/kernel/vmlinux_32.lds.S | 1 - arch/x86/kernel/vmlinux_64.lds.S | 2 - arch/x86/xen/spinlock.c | 2 +- drivers/char/random.c | 5 - drivers/pci/intr_remapping.c | 11 +-- include/asm-generic/vmlinux.lds.h | 13 --- include/linux/init.h | 43 -------- include/linux/irq.h | 15 --- include/linux/kernel_stat.h | 16 ++- init/Makefile | 2 +- init/dyn_array.c | 120 ----------------------- init/main.c | 11 +-- kernel/irq/chip.c | 30 +----- kernel/irq/handle.c | 114 ++-------------------- 19 files changed, 103 insertions(+), 496 deletions(-) delete mode 100644 init/dyn_array.c (limited to 'arch/x86/Kconfig') diff --git a/arch/Kconfig b/arch/Kconfig index c8a7c2eb649..071004d3a1b 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -102,7 +102,3 @@ config HAVE_CLK help The calls support software clock gating and thus are a key power management tool on many systems. - -config HAVE_DYN_ARRAY - def_bool n - diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8636ddf2f4a..8da6123a60d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,7 +33,6 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_DYN_ARRAY config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index e03bc0f87ee..6f80dc2f137 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -107,7 +107,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; @@ -120,7 +119,7 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
*/ -static struct irq_cfg irq_cfg_legacy[] __initdata = { +static struct irq_cfg irq_cfgx[NR_IRQS] = { [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, @@ -139,48 +138,26 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = { [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; - -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = ARRAY_SIZE(irq_cfg_legacy); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); -} - #define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) + for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); - -struct irq_cfg *irq_cfg(unsigned int irq) +static struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq < nr_irqs) - return &irq_cfgx[irq]; - - return NULL; + return irq < nr_irqs ? irq_cfgx + irq : NULL; } -struct irq_cfg *irq_cfg_alloc(unsigned int irq) + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) { return irq_cfg(irq); } +/* + * Rough estimation of how many shared IRQs there are, can be changed + * anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + /* * This is performance-critical, we want to do it O(1) * @@ -193,59 +170,29 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? 
*/ -static int nr_irq_2_pin = 0x100; +static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) + +static void __init irq_2_pin_init(void) { - struct dyn_array *da = data; - struct irq_pin_list *pin; + struct irq_pin_list *pin = irq_2_pin_head; int i; - pin = *da->name; - - for (i = 1; i < *da->nr; i++) + for (i = 1; i < PIN_MAP_SIZE; i++) pin[i-1].next = &pin[i]; irq_2_pin_ptr = &pin[0]; } -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); static struct irq_pin_list *get_one_free_irq_2_pin(void) { - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); + struct irq_pin_list *pin = irq_2_pin_ptr; if (!pin) panic("can not get more irq_2_pin\n"); - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - irq_2_pin_ptr = pin->next; pin->next = NULL; - return pin; } @@ -284,8 +231,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); + + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -1044,11 +992,11 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - /* + /* * For MPS mode, so far only needed by ES7000 platform */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } #ifdef CONFIG_X86_32 @@ -1232,19 +1180,19 @@ static struct irq_chip ir_ioapic_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { - int apic, idx, pin; + int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* * nonexistent IRQs are edge default */ - return 0; + return 0; } #else static inline int IO_APIC_irq_trigger(int irq) @@ -1509,8 +1457,8 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); @@ -2089,9 +2037,9 @@ static int ioapic_retrigger_irq(unsigned int irq) #else static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + send_IPI_self(irq_cfg(irq)->vector); - return 1; + return 1; } #endif @@ -2189,7 +2137,7 @@ static int 
migrate_irq_remapped_level(int irq) if (io_apic_level_ack_pending(irq)) { /* - * Interrupt in progress. Migrating irq now will change the + * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse * the EOI broadcast performed by cpu. * So, delay the irq migration to the next instance. @@ -2426,28 +2374,28 @@ static void ack_apic_level(unsigned int irq) } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; #ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .set_affinity = set_ir_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -2636,8 +2584,8 @@ static inline void __init check_timer(void) local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); /* * get/set the timer IRQ vector: @@ -2822,12 +2770,12 @@ void __init setup_IO_APIC(void) io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* + /* * Set up IO-APIC IRQ routing. 
*/ #ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); #endif sync_Arb_IDs(); setup_IO_APIC_irqs(); @@ -2842,9 +2790,9 @@ void __init setup_IO_APIC(void) static int __init io_apic_bug_finalize(void) { - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; } late_initcall(io_apic_bug_finalize); @@ -3199,7 +3147,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) if (index < 0) { printk(KERN_ERR "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); + pci_name(dev)); return -ENOSPC; } return index; @@ -3885,23 +3833,24 @@ static struct resource * __init ioapic_setup_resources(void) void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; struct resource *ioapic_res; + int i; + irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; #ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } #endif } else { #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2b7dab699e8..410c88f0bfe 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size, old_size, da_size; + ssize_t size, old_size; char *ptr; int cpu; unsigned long align = 1; @@ -150,9 +150,8 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; - da_size = per_cpu_dyn_array_size(&align); align = max_t(unsigned long, PAGE_SIZE, align); - size = roundup(old_size + da_size, align); + size = roundup(old_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -182,9 +181,6 @@ void __init setup_per_cpu_areas(void) #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - - per_cpu_alloc_dyn_array(cpu, ptr + old_size); - } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 817aa55a120..0c9667f0752 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_irqs_this_cpu(desc)++; + kstat_incr_irqs_this_cpu(realirq, desc); if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index c36007ab394..a9b8560adbc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -145,7 +145,6 @@ SECTIONS *(.x86_cpu_dev.init) __x86_cpu_dev_end = .; } - DYN_ARRAY_INIT(8) SECURITY_INIT . 
= ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 30973dbac8c..3245ad72594 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -174,8 +174,6 @@ SECTIONS } __x86_cpu_dev_end = .; - DYN_ARRAY_INIT(8) - SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index bb6bc721b13..5601506f2dd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_irqs_this_cpu(irq_to_desc(irq))++; + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: raw_local_irq_restore(flags); diff --git a/drivers/char/random.c b/drivers/char/random.c index 9ce80213007..1137d297604 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,12 +558,7 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct timer_rand_state **irq_timer_state; -DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PAGE_SIZE, NULL); -#else static struct timer_rand_state *irq_timer_state[NR_IRQS]; -#endif static struct timer_rand_state *get_timer_rand_state(unsigned int irq) { diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 0f43b265eee..950769e8747 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -19,20 +19,13 @@ struct irq_2_iommu { u8 irte_mask; }; -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_2_iommu *irq_2_iommuX; -DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL); -#else static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; -#endif static struct irq_2_iommu *irq_2_iommu(unsigned int irq) { - if (irq < nr_irqs) - return &irq_2_iommuX[irq]; - - return NULL; + return (irq < nr_irqs) ?: irq_2_iommuX + irq : NULL; } + static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { return irq_2_iommu(irq); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index c68eda9d9a9..7440a0dcedd 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -210,19 +210,6 @@ * All archs are supposed to use RO_DATA() */ #define RODATA RO_DATA(4096) -#define DYN_ARRAY_INIT(align) \ - . = ALIGN((align)); \ - .dyn_array.init : AT(ADDR(.dyn_array.init) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__dyn_array_start) = .; \ - *(.dyn_array.init) \ - VMLINUX_SYMBOL(__dyn_array_end) = .; \ - } \ - . 
= ALIGN((align)); \ - .per_cpu_dyn_array.init : AT(ADDR(.per_cpu_dyn_array.init) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__per_cpu_dyn_array_start) = .; \ - *(.per_cpu_dyn_array.init) \ - VMLINUX_SYMBOL(__per_cpu_dyn_array_end) = .; \ - } #define SECURITY_INIT \ .security_initcall.init : AT(ADDR(.security_initcall.init) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__security_initcall_start) = .; \ diff --git a/include/linux/init.h b/include/linux/init.h index 59fbb4aaba6..70ad53e1eab 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -247,49 +247,6 @@ struct obs_kernel_param { /* Relies on boot_command_line being set */ void __init parse_early_param(void); -struct dyn_array { - void **name; - unsigned long size; - unsigned int *nr; - unsigned long align; - void (*init_work)(void *); -}; -extern struct dyn_array *__dyn_array_start[], *__dyn_array_end[]; -extern struct dyn_array *__per_cpu_dyn_array_start[], *__per_cpu_dyn_array_end[]; - -#define DEFINE_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ - static struct dyn_array __dyn_array_##nameX __initdata = \ - { .name = (void **)&(nameX),\ - .size = sizeX,\ - .nr = &(nrX),\ - .align = alignX,\ - .init_work = init_workX,\ - }; \ - static struct dyn_array *__dyn_array_ptr_##nameX __used \ - __attribute__((__section__(".dyn_array.init"))) = \ - &__dyn_array_##nameX - -#define DEFINE_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ - DEFINE_DYN_ARRAY_ADDR(nameX, nameX, sizeX, nrX, alignX, init_workX) - -#define DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, addrX, sizeX, nrX, alignX, init_workX) \ - static struct dyn_array __per_cpu_dyn_array_##nameX __initdata = \ - { .name = (void **)&(addrX),\ - .size = sizeX,\ - .nr = &(nrX),\ - .align = alignX,\ - .init_work = init_workX,\ - }; \ - static struct dyn_array *__per_cpu_dyn_array_ptr_##nameX __used \ - __attribute__((__section__(".per_cpu_dyn_array.init"))) = \ - &__per_cpu_dyn_array_##nameX - -#define DEFINE_PER_CPU_DYN_ARRAY(nameX, sizeX, nrX, alignX, init_workX) \ - DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX) - -extern void pre_alloc_dyn_array(void); -extern unsigned long per_cpu_dyn_array_size(unsigned long *align); -extern void per_cpu_alloc_dyn_array(int cpu, char *ptr); #endif /* __ASSEMBLY__ */ /** diff --git a/include/linux/irq.h b/include/linux/irq.h index 3f33c779030..38bf89f2ade 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -139,8 +139,6 @@ struct irq_chip { const char *typename; }; -struct timer_rand_state; -struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @@ -167,9 +165,6 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned int *kstat_irqs; -#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -198,23 +193,13 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_HAVE_DYN_ARRAY -/* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; -#else -extern struct irq_desc *irq_desc; -#endif static inline struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < nr_irqs) ? 
irq_desc + irq : NULL; } -#ifdef CONFIG_HAVE_DYN_ARRAY -#define kstat_irqs_this_cpu(DESC) \ - ((DESC)->kstat_irqs[smp_processor_id()]) -#endif - /* * Migration helpers for obsolete names, they will go away: */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 21249d8c129..a9d0d360b77 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,9 +28,7 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifndef CONFIG_HAVE_DYN_ARRAY unsigned int irqs[NR_IRQS]; -#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -41,20 +39,18 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); -#ifndef CONFIG_HAVE_DYN_ARRAY -#define kstat_irqs_this_cpu(irq) \ - (kstat_this_cpu.irqs[irq]) -#endif +struct irq_desc; +static inline void kstat_incr_irqs_this_cpu(unsigned int irq, + struct irq_desc *desc) +{ + kstat_this_cpu.irqs[irq]++; +} -#ifndef CONFIG_HAVE_DYN_ARRAY static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } -#else -extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); -#endif /* * Number of interrupts per specific IRQ source, since bootup diff --git a/init/Makefile b/init/Makefile index dc5eeca6eb6..4a243df426f 100644 --- a/init/Makefile +++ b/init/Makefile @@ -2,7 +2,7 @@ # Makefile for the linux kernel. # -obj-y := main.o dyn_array.o version.o mounts.o +obj-y := main.o version.o mounts.o ifneq ($(CONFIG_BLK_DEV_INITRD),y) obj-y += noinitramfs.o else diff --git a/init/dyn_array.c b/init/dyn_array.c deleted file mode 100644 index c8d5e2a1858..00000000000 --- a/init/dyn_array.c +++ /dev/null @@ -1,120 +0,0 @@ -#include -#include -#include -#include -#include -#include - -void __init pre_alloc_dyn_array(void) -{ -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long total_size = 0, size, phys; - unsigned long max_align = 1; - struct dyn_array **daa; - char *ptr; - - /* get the total size at first */ - for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - printk(KERN_DEBUG "dyn_array %pF size:%#lx nr:%d align:%#lx\n", - da->name, da->size, *da->nr, da->align); - size = da->size * (*da->nr); - total_size += roundup(size, da->align); - if (da->align > max_align) - max_align = da->align; - } - if (total_size) - printk(KERN_DEBUG "dyn_array total_size: %#lx\n", - total_size); - else - return; - - /* allocate them all together */ - max_align = max_t(unsigned long, max_align, PAGE_SIZE); - ptr = __alloc_bootmem(total_size, max_align, 0); - phys = virt_to_phys(ptr); - - for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - size = da->size * (*da->nr); - phys = roundup(phys, da->align); - printk(KERN_DEBUG "dyn_array %pF ==> [%#lx - %#lx]\n", - da->name, phys, phys + size); - *da->name = phys_to_virt(phys); - - phys += size; - - if (da->init_work) - da->init_work(da); - } -#else -#ifdef CONFIG_GENERIC_HARDIRQS - unsigned int i; - - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].irq = i; -#endif -#endif -} - -unsigned long __init per_cpu_dyn_array_size(unsigned long *align) -{ - unsigned long total_size = 0; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long size; - struct dyn_array **daa; - unsigned max_align = 1; - - for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - printk(KERN_DEBUG "per_cpu_dyn_array %pF size:%#lx nr:%d align:%#lx\n", - da->name, da->size, *da->nr, da->align); - size = 
da->size * (*da->nr); - total_size += roundup(size, da->align); - if (da->align > max_align) - max_align = da->align; - } - if (total_size) { - printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n", - total_size); - *align = max_align; - } -#endif - return total_size; -} - -#ifdef CONFIG_SMP -void __init per_cpu_alloc_dyn_array(int cpu, char *ptr) -{ -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned long size, phys; - struct dyn_array **daa; - unsigned long addr; - void **array; - - phys = virt_to_phys(ptr); - for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) { - struct dyn_array *da = *daa; - - size = da->size * (*da->nr); - phys = roundup(phys, da->align); - printk(KERN_DEBUG "per_cpu_dyn_array %pF ==> [%#lx - %#lx]\n", - da->name, phys, phys + size); - - addr = (unsigned long)da->name; - addr += per_cpu_offset(cpu); - array = (void **)addr; - *array = phys_to_virt(phys); - *da->name = *array; /* so init_work could use it directly */ - - phys += size; - - if (da->init_work) - da->init_work(da); - } -#endif -} -#endif diff --git a/init/main.c b/init/main.c index e81cf427d9c..27f6bf6108e 100644 --- a/init/main.c +++ b/init/main.c @@ -391,23 +391,17 @@ EXPORT_SYMBOL(__per_cpu_offset); static void __init setup_per_cpu_areas(void) { - unsigned long size, i, old_size; + unsigned long size, i; char *ptr; unsigned long nr_possible_cpus = num_possible_cpus(); - unsigned long align = 1; - unsigned da_size; /* Copy section for each CPU (we discard the original) */ - old_size = PERCPU_ENOUGH_ROOM; - da_size = per_cpu_dyn_array_size(&align); - align = max_t(unsigned long, PAGE_SIZE, align); - size = ALIGN(old_size + da_size, align); + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - per_cpu_alloc_dyn_array(i, ptr + old_size); ptr += size; } } @@ -573,7 +567,6 @@ asmlinkage void __init start_kernel(void) printk(KERN_NOTICE); printk(linux_banner); setup_arch(&command_line); - pre_alloc_dyn_array(); mm_init_owner(&init_mm, &init_task); setup_command_line(command_line); unwind_setup(); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e6f73dbfcc3..d96d6f687c4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -326,11 +326,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -371,11 +367,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -422,11 +414,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* * If its disabled or no action available @@ -490,11 +478,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) 
mask_ack_irq(desc, irq); goto out_unlock; } -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); @@ -549,11 +533,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f837133cdfb..9fe86b3a60a 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,11 +18,6 @@ #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -30,15 +25,10 @@ static struct lock_class_key irq_desc_lock_class; * * Handles spurious and unhandled IRQ's. It also prints a debugmessage. */ -void -handle_bad_irq(unsigned int irq, struct irq_desc *desc) +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); ack_bad_irq(irq); } @@ -59,80 +49,6 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); -#ifdef CONFIG_HAVE_DYN_ARRAY -static struct irq_desc irq_desc_init = { - .irq = -1U, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif -}; - - -static void init_one_irq_desc(struct irq_desc *desc) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - lockdep_set_class(&desc->lock, &irq_desc_lock_class); -} - -extern int after_bootmem; -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); - -static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) -{ - unsigned long bytes, total_bytes; - char *ptr; - int i; - unsigned long phys; - - /* Compute how many bytes we need per irq and allocate them */ - bytes = nr * sizeof(unsigned int); - total_bytes = bytes * nr_desc; - if (after_bootmem) - ptr = kzalloc(total_bytes, GFP_ATOMIC); - else - ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!ptr) - panic(" can not allocate kstat_irqs\n"); - - phys = __pa(ptr); - printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_desc; i++) { - desc[i].kstat_irqs = (unsigned int *)ptr; - ptr += bytes; - } -} - -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - int i; - struct irq_desc *desc; - - desc = *da->name; - - for (i = 0; i < *da->nr; i++) { - init_one_irq_desc(&desc[i]); - desc[i].irq = i; - } - - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, *da->nr, nr_cpu_ids); -} - -struct irq_desc *irq_desc; -DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); - -#else - struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... 
NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -146,8 +62,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; -#endif - /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. @@ -258,11 +172,8 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; -#ifdef CONFIG_HAVE_DYN_ARRAY - kstat_irqs_this_cpu(desc)++; -#else - kstat_irqs_this_cpu(irq)++; -#endif + kstat_incr_irqs_this_cpu(irq, desc); + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -351,23 +262,16 @@ out: #ifdef CONFIG_TRACE_IRQFLAGS +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + void early_init_irq_lock_class(void) { -#ifndef CONFIG_HAVE_DYN_ARRAY int i; for (i = 0; i < nr_irqs; i++) lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); -#endif } #endif - -#ifdef CONFIG_HAVE_DYN_ARRAY -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc->kstat_irqs[cpu]; -} -#endif -EXPORT_SYMBOL(kstat_irqs_cpu); - -- cgit v1.2.3-70-g09d2 From 9ba16087d9f996a93ab6f4453a52a4b24bc1f25c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: Kconfig: eliminate "def_bool n" constructs Using "def_bool n" is pointless; simply using bool here is more appropriate. Further, retaining such options that have no prompt and aren't selected by anything also seems questionable. Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: Tony Luck Cc: Thomas Gleixner Cc: Bartlomiej Zolnierkiewicz Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 18 +++++++++--------- arch/ia64/Kconfig | 8 -------- arch/x86/Kconfig | 23 +++++------------------ drivers/ide/Kconfig | 2 +- kernel/time/Kconfig | 1 - lib/Kconfig | 4 ++-- mm/Kconfig | 4 ++-- 7 files changed, 19 insertions(+), 41 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/Kconfig b/arch/Kconfig index 0267babe5eb..e6ab550bceb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -28,7 +28,7 @@ config OPROFILE_IBS If unsure, say N. config HAVE_OPROFILE - def_bool n + bool config KPROBES bool "Kprobes" @@ -42,7 +42,7 @@ config KPROBES If in doubt, say "N". config HAVE_EFFICIENT_UNALIGNED_ACCESS - def_bool n + bool help Some architectures are unable to perform unaligned accesses without the use of get_unaligned/put_unaligned. Others are @@ -65,13 +65,13 @@ config KRETPROBES depends on KPROBES && HAVE_KRETPROBES config HAVE_IOREMAP_PROT - def_bool n + bool config HAVE_KPROBES - def_bool n + bool config HAVE_KRETPROBES - def_bool n + bool # # An arch should select this if it provides all these things: @@ -89,16 +89,16 @@ config HAVE_KRETPROBES # signal delivery calls tracehook_signal_handler() # config HAVE_ARCH_TRACEHOOK - def_bool n + bool config HAVE_DMA_ATTRS - def_bool n + bool config USE_GENERIC_SMP_HELPERS - def_bool n + bool config HAVE_CLK - def_bool n + bool help The calls support software clock gating and thus are a key power management tool on many systems. 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 48e496fe1e7..3b7aa38254a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -60,14 +60,6 @@ config RWSEM_XCHGADD_ALGORITHM bool default y -config ARCH_HAS_ILOG2_U32 - bool - default n - -config ARCH_HAS_ILOG2_U64 - bool - default n - config HUGETLB_PAGE_SIZE_VARIABLE bool depends on HUGETLB_PAGE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f65c2744d57..7ccb6e60e60 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -39,10 +39,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 - -config GENERIC_LOCKBREAK - def_bool n - config GENERIC_TIME def_bool y @@ -95,7 +91,7 @@ config GENERIC_HWEIGHT def_bool y config GENERIC_GPIO - def_bool n + bool config ARCH_MAY_HAVE_PC_FDC def_bool y @@ -106,12 +102,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD -config ARCH_HAS_ILOG2_U32 - def_bool n - -config ARCH_HAS_ILOG2_U64 - def_bool n - config ARCH_HAS_CPU_IDLE_WAIT def_bool y @@ -758,9 +748,8 @@ config I8K Say N otherwise. config X86_REBOOTFIXUPS - def_bool n - prompt "Enable X86 board specific fixups for reboot" - depends on X86_32 && X86 + bool "Enable X86 board specific fixups for reboot" + depends on X86_32 ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -944,8 +933,7 @@ config HIGHMEM depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) config X86_PAE - def_bool n - prompt "PAE (Physical Address Extension) Support" + bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G select RESOURCES_64BIT help @@ -1238,8 +1226,7 @@ config X86_PAT If unsure, say Y. config EFI - def_bool n - prompt "EFI runtime service support" + bool "EFI runtime service support" depends on ACPI ---help--- This enables the kernel to use EFI runtime services that are diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 6c6dd2faced..74a369a6116 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -4,7 +4,7 @@ # Select HAVE_IDE if IDE is supported config HAVE_IDE - def_bool n + bool menuconfig IDE tristate "ATA/ATAPI/MFM/RLL support" diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 8d53106a0a9..95ed42951e0 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -3,7 +3,6 @@ # config TICK_ONESHOT bool - default n config NO_HZ bool "Tickless System (Dynamic Ticks)" diff --git a/lib/Kconfig b/lib/Kconfig index c7ad7a5b353..85cf7ea978a 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -8,10 +8,10 @@ config BITREVERSE tristate config GENERIC_FIND_FIRST_BIT - def_bool n + bool config GENERIC_FIND_NEXT_BIT - def_bool n + bool config CRC_CCITT tristate "CRC-CCITT functions" diff --git a/mm/Kconfig b/mm/Kconfig index 0bd9c2dbb2a..5585f129359 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT # with gcc 3.4 and later. 
# config SPARSEMEM_STATIC - def_bool n + bool # # Architecture platforms which require a two level mem_section in SPARSEMEM @@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME depends on SPARSEMEM && !SPARSEMEM_STATIC config SPARSEMEM_VMEMMAP_ENABLE - def_bool n + bool config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" -- cgit v1.2.3-70-g09d2 From 89cedfefca1d446ee2598fd3bcbb23ee3802e26a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 16 Oct 2008 19:00:08 -0400 Subject: cpuidle: upon BIOS bug, default to default_idle rather than polling http://bugzilla.kernel.org/show_bug.cgi?id=11345 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- arch/x86/Kconfig | 3 +++ drivers/cpuidle/cpuidle.c | 4 ++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..f8caf040650 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,6 +123,9 @@ config GENERIC_TIME_VSYSCALL config ARCH_HAS_CPU_RELAX def_bool y +config ARCH_HAS_DEFAULT_IDLE + def_bool y + config ARCH_HAS_CACHE_LINE_SIZE def_bool y diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index eb2cade562d..bb6e3b33804 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -56,7 +56,11 @@ static void cpuidle_idle_call(void) if (pm_idle_old) pm_idle_old(); else +#if defined(CONFIG_ARCH_HAS_DEFAULT_IDLE) + default_idle(); +#else local_irq_enable(); +#endif return; } -- cgit v1.2.3-70-g09d2 From dc52ddc0e6f45b04780b26fc0813509f8e798c42 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:21 -0700 Subject: container freezer: implement freezer cgroup subsystem This patch implements a new freezer subsystem in the control groups framework. It provides a way to stop and resume execution of all tasks in a cgroup by writing in the cgroup filesystem. The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "RUNNING" will unfreeze the tasks in the cgroup. Reading will return the current state. * Examples of usage: # mkdir /containers/freezer # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks to get the status of the freezer subsystem: # cat /containers/0/freezer.state RUNNING to freeze all tasks in the container: # echo FROZEN > /containers/0/freezer.state # cat /containers/0/freezer.state FREEZING # cat /containers/0/freezer.state FROZEN to unfreeze all tasks in the container: # echo RUNNING > /containers/0/freezer.state # cat /containers/0/freezer.state RUNNING This is the basic mechanism, which should do the right thing for user space tasks in a simple scenario. It's important to note that freezing can be incomplete. In that case we return EBUSY. This means that some tasks in the cgroup are busy doing something that prevents us from completely freezing the cgroup at this time. After EBUSY, the cgroup will remain partially frozen -- reflected by freezer.state reporting "FREEZING" when read. The state will remain "FREEZING" until one of these things happens: 1) Userspace cancels the freezing operation by writing "RUNNING" to the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal and returns EIO) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. 
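A minimal user-space sketch of this EBUSY/retry protocol (illustrative only, not part of this patch; the /containers/0/freezer.state path follows the examples above, and the retry bound is an arbitrary assumption):

    /* Write "FROZEN" to a freezer.state file, retrying while the kernel
     * reports -EBUSY, i.e. while the cgroup is only partially frozen. */
    #include <errno.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int freeze_container(const char *state_file)
    {
            int tries;

            for (tries = 0; tries < 10; tries++) {
                    int fd = open(state_file, O_WRONLY);
                    ssize_t n;

                    if (fd < 0)
                            return -1;
                    n = write(fd, "FROZEN", 6);
                    close(fd);
                    if (n >= 0)
                            return 0;       /* state is now "FROZEN" */
                    if (errno != EBUSY)
                            return -1;      /* e.g. EIO for an illegal state string */
                    /* Partially frozen: freezer.state reads "FREEZING";
                     * writing "FROZEN" again retries the operation. */
                    usleep(100000);
            }
            return -1;      /* still "FREEZING"; write "RUNNING" to cancel */
    }

For example, freeze_container("/containers/0/freezer.state") keeps retrying until every task in cgroup 0 is in the fridge or the retry budget runs out.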
[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: export thaw_process] Signed-off-by: Cedric Le Goater Signed-off-by: Matt Helsley Acked-by: Serge E. Hallyn Tested-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/Kconfig | 1 + arch/arm/Kconfig | 2 + arch/avr32/Kconfig | 2 + arch/blackfin/Kconfig | 3 + arch/cris/Kconfig | 2 + arch/frv/Kconfig | 2 + arch/h8300/Kconfig | 2 + arch/ia64/Kconfig | 2 + arch/m32r/Kconfig | 2 + arch/m68k/Kconfig | 2 + arch/m68knommu/Kconfig | 2 + arch/mips/Kconfig | 2 + arch/mn10300/Kconfig | 2 + arch/parisc/Kconfig | 2 + arch/powerpc/Kconfig | 2 + arch/s390/Kconfig | 2 + arch/sh/Kconfig | 2 + arch/sparc/Kconfig | 2 + arch/sparc64/Kconfig | 1 + arch/um/Kconfig | 2 + arch/x86/Kconfig | 1 + arch/xtensa/Kconfig | 1 + include/linux/cgroup_subsys.h | 6 + include/linux/freezer.h | 29 ++-- init/Kconfig | 7 + kernel/Kconfig.freezer | 2 + kernel/Makefile | 1 + kernel/cgroup_freezer.c | 366 ++++++++++++++++++++++++++++++++++++++++++ kernel/freezer.c | 32 ++++ kernel/power/Kconfig | 3 - 30 files changed, 465 insertions(+), 22 deletions(-) create mode 100644 kernel/Kconfig.freezer create mode 100644 kernel/cgroup_freezer.c (limited to 'arch/x86/Kconfig') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index a0f642b6a4b..6110197757a 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -70,6 +70,7 @@ config AUTO_IRQ_AFFINITY default y source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "System setup" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4853f9df37b..df39d20f742 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -192,6 +192,8 @@ config VECTORS_BASE source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System Type" choice diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 7c239a91627..33a5b2969eb 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -72,6 +72,8 @@ config GENERIC_BUG source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System Type and features" source "kernel/time/Kconfig" diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 8102c79aaa9..29e71ed6b8a 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -64,8 +64,11 @@ config HARDWARE_PM depends on OPROFILE source "init/Kconfig" + source "kernel/Kconfig.preempt" +source "kernel/Kconfig.freezer" + menu "Blackfin Processor Options" comment "Processor and Board Settings" diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 9389d38f222..07335e719bf 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -62,6 +62,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "General setup" source "fs/Kconfig.binfmt" diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index a5aac1b0756..9d1552a9ee2 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -66,6 +66,8 @@ mainmenu "Fujitsu FR-V Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Fujitsu FR-V system setup" diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index c7966746fbf..bd1995403c6 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -90,6 +90,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "arch/h8300/Kconfig.cpu" menu "Executable file formats" diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 3b7aa38254a..912c57db2d2 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -7,6 +7,8 @@ mainmenu "IA-64 Linux Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu 
"Processor type and features" config IA64 diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 00289c178f8..dbaed4a6381 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -42,6 +42,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 677c93a490f..836fb66f080 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -62,6 +62,8 @@ mainmenu "Linux/68k Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Platform dependent setup" config EISA diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig index 0a8998315e5..76b66feb74d 100644 --- a/arch/m68knommu/Kconfig +++ b/arch/m68knommu/Kconfig @@ -75,6 +75,8 @@ config NO_IOPORT source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" choice diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index b905744d791..5f149b030c0 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1885,6 +1885,8 @@ config PROBE_INITRD_HEADER add initrd or initramfs image to the kernel image. Otherwise, say N. +source "kernel/Kconfig.freezer" + menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" config HW_HAS_EISA diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index dd557c9cf00..9a9f4335887 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -68,6 +68,8 @@ mainmenu "Matsushita MN10300/AM33 Kernel Configuration" source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Matsushita MN10300 system setup" diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 8313fccced5..2bd1f6ef5db 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -90,6 +90,8 @@ config ARCH_MAY_HAVE_PC_FDC source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Processor type and features" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 380baa1780e..9391199d9e7 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -230,6 +230,8 @@ config PPC_OF_PLATFORM_PCI source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "arch/powerpc/sysdev/Kconfig" source "arch/powerpc/platforms/Kconfig" diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bc581d8a7cd..70b7645ce74 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -78,6 +78,8 @@ config S390 source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "Base setup" comment "Processor type and features" diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5131d50f851..2ed5713b754 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -106,6 +106,8 @@ config IO_TRAPPED source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "System type" # diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 97671dac12a..e594559c8db 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -37,6 +37,8 @@ config HZ source "init/Kconfig" +source "kernel/Kconfig.freezer" + menu "General machine setup" config SMP diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 5446e2a499b..035b15af90d 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -96,6 +96,7 @@ config GENERIC_HARDIRQS_NO__DO_IRQ def_bool y source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 6976812cfb1..393bccfe178 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -229,6 +229,8 @@ endmenu source "init/Kconfig" +source "kernel/Kconfig.freezer" + source "drivers/block/Kconfig" source "arch/um/Kconfig.char" diff 
--git a/arch/x86/Kconfig b/arch/x86/Kconfig index bd3c2c53873..49349ba77d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -193,6 +193,7 @@ config X86_TRAMPOLINE config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 02e417d3d8e..a213260b51e 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -55,6 +55,7 @@ config HZ default 100 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e2877454ec8..9c22396e8b5 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -48,3 +48,9 @@ SUBSYS(devices) #endif /* */ + +#ifdef CONFIG_CGROUP_FREEZER +SUBSYS(freezer) +#endif + +/* */ diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 17e3bb42dd3..8f225339eee 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -46,26 +46,11 @@ static inline bool should_send_signal(struct task_struct *p) /* * Wake up a frozen process - * - * task_lock() is taken to prevent the race with refrigerator() which may - * occur if the freezing of tasks fails. Namely, without the lock, if the - * freezing of tasks failed, thaw_tasks() might have run before a task in - * refrigerator() could call frozen_process(), in which case the task would be - * frozen and no one would thaw it. */ -static inline int thaw_process(struct task_struct *p) -{ - task_lock(p); - if (frozen(p)) { - p->flags &= ~PF_FROZEN; - task_unlock(p); - wake_up_process(p); - return 1; - } - clear_freeze_flag(p); - task_unlock(p); - return 0; -} +extern int __thaw_process(struct task_struct *p); + +/* Takes and releases task alloc lock using task_lock() */ +extern int thaw_process(struct task_struct *p); extern void refrigerator(void); extern int freeze_processes(void); @@ -83,6 +68,12 @@ static inline int try_to_freeze(void) extern bool freeze_task(struct task_struct *p, bool sig_only); extern void cancel_freezing(struct task_struct *p); +#ifdef CONFIG_CGROUP_FREEZER +extern int cgroup_frozen(struct task_struct *task); +#else /* !CONFIG_CGROUP_FREEZER */ +static inline int cgroup_frozen(struct task_struct *task) { return 0; } +#endif /* !CONFIG_CGROUP_FREEZER */ + /* * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it * calls wait_for_completion(&vfork) and reset right after it returns from this diff --git a/init/Kconfig b/init/Kconfig index 5ceff3249a2..8828ed0b205 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -299,6 +299,13 @@ config CGROUP_NS for instance virtual servers and checkpoint/restart jobs. +config CGROUP_FREEZER + bool "control group freezer subsystem" + depends on CGROUPS + help + Provides a way to freeze and unfreeze all tasks in a + cgroup. 
+ config CGROUP_DEVICE bool "Device controller for cgroups" depends on CGROUPS && EXPERIMENTAL diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 00000000000..a3bb4cb5253 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Makefile b/kernel/Makefile index e8194d15d5f..066550aa61c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 00000000000..b08722de610 --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,366 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include + +enum freezer_state { + STATE_RUNNING = 0, + STATE_FREEZING, + STATE_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +int cgroup_frozen(struct task_struct *task) +{ + struct freezer *freezer; + enum freezer_state state; + + task_lock(task); + freezer = task_freezer(task); + state = freezer->state; + task_unlock(task); + + return state == STATE_FROZEN; +} + +/* + * cgroups_write_string() limits the size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "RUNNING", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. 
+ * + * (RUNNING) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______RUNNING_______/ | + * \_____________________________RUNNING___________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * css_set_lock + * cgroup_mutex (AKA cgroup_lock) + * task->alloc_lock (AKA task_lock) + * freezer->lock + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * can_attach(): + * cgroup_mutex + * + * cgroup_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * task->alloc_lock (to get task's cgroup) + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (to prevent races with freeze_task()) + * sighand->siglock + */ +static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = STATE_RUNNING; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + + +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + int retval = 0; + + /* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ + freezer = cgroup_freezer(new_cgroup); + if (freezer->state == STATE_FROZEN) + retval = -EBUSY; + return retval; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + task_lock(task); + freezer = task_freezer(task); + task_unlock(task); + + BUG_ON(freezer->state == STATE_FROZEN); + spin_lock_irq(&freezer->lock); + /* Locking avoids race with FREEZING -> RUNNING transitions. */ + if (freezer->state == STATE_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void check_if_frozen(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + /* + * Task is frozen or will freeze immediately when next it gets + * woken + */ + if (frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task))) + nfrozen++; + } + + /* + * Transition to FROZEN when no new tasks can be added ensures + * that we never exist in the FROZEN state while there are unfrozen + * tasks. 
+ */ + if (nfrozen == ntotal) + freezer->state = STATE_FROZEN; + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == STATE_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exited write. */ + check_if_frozen(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = STATE_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (task_is_stopped_or_traced(task) && freezing(task)) + /* + * The freeze flag is set so these tasks will + * immediately go into the fridge upon waking. + */ + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + int do_wake; + + task_lock(task); + do_wake = __thaw_process(task); + task_unlock(task); + if (do_wake) + wake_up_process(task); + } + cgroup_iter_end(cgroup, &it); + freezer->state = STATE_RUNNING; + + return 0; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + check_if_frozen(cgroup, freezer); /* may update freezer->state */ + if (goal_state == freezer->state) + goto out; + switch (freezer->state) { + case STATE_RUNNING: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + case STATE_FREEZING: + if (goal_state == STATE_FROZEN) { + /* Userspace is retrying after + * "/bin/echo FROZEN > freezer.state" returned -EBUSY */ + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + } + /* state == FREEZING and goal_state == RUNNING, so unfreeze */ + case STATE_FROZEN: + retval = unfreeze_cgroup(cgroup, freezer); + break; + default: + break; + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[STATE_RUNNING]) == 0) + goal_state = STATE_RUNNING; + else if (strcmp(buffer, freezer_state_strs[STATE_FROZEN]) == 0) + goal_state = STATE_FROZEN; + else + return -EIO; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct 
cgroup_subsys freezer_subsys = { .name = "freezer", .create = freezer_create, .destroy = freezer_destroy, .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, .attach = NULL, .fork = freezer_fork, .exit = NULL, }; diff --git a/kernel/freezer.c b/kernel/freezer.c index cb0931f8930..ba6248b323e 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -120,3 +120,35 @@ void cancel_freezing(struct task_struct *p) spin_unlock_irqrestore(&p->sighand->siglock, flags); } } + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ebdd7f55273..dcd165f92a8 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -85,9 +85,6 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y -config FREEZER - def_bool PM_SLEEP - config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE -- cgit v1.2.3-70-g09d2 From 606576ce816603d9fe1fb453a88bc6eea16ca709 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Oct 2008 19:06:12 -0400 Subject: ftrace: rename FTRACE to FUNCTION_TRACER Due to confusion between the ftrace infrastructure and the gcc profiling tracer "ftrace", this patch renames the config options from FTRACE to FUNCTION_TRACER. The other two names that are offspring from FTRACE, DYNAMIC_FTRACE and FTRACE_MCOUNT_RECORD, will stay the same. This patch was generated mostly by script, and partially by hand. 
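As a hedged sketch of what the rename means for code carrying such guards (not taken from this patch; my_pause_function_tracer is a hypothetical helper modeled on the __ftrace_enabled_save() pattern in the include/linux/ftrace.h hunk below):

    /* Guards that tested CONFIG_FTRACE now test CONFIG_FUNCTION_TRACER;
     * CONFIG_DYNAMIC_FTRACE and CONFIG_FTRACE_MCOUNT_RECORD are unchanged. */
    #ifdef CONFIG_FUNCTION_TRACER          /* was: #ifdef CONFIG_FTRACE */
    extern int ftrace_enabled;

    static inline int my_pause_function_tracer(void)
    {
            int saved = ftrace_enabled;

            ftrace_enabled = 0;            /* stop the function tracer */
            return saved;
    }
    #else
    static inline int my_pause_function_tracer(void) { return 0; }
    #endif /* CONFIG_FUNCTION_TRACER */

The same mechanical substitution applies to build files, e.g. "ifdef CONFIG_FTRACE" becoming "ifdef CONFIG_FUNCTION_TRACER" in Makefiles, as the diffs below show.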
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- Makefile | 2 +- arch/arm/Kconfig | 4 ++-- arch/arm/boot/compressed/Makefile | 2 +- arch/arm/include/asm/ftrace.h | 2 +- arch/arm/kernel/armksyms.c | 2 +- arch/arm/kernel/entry-common.S | 4 ++-- arch/powerpc/Kconfig | 2 +- arch/powerpc/Makefile | 2 +- arch/powerpc/include/asm/ftrace.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/entry_32.S | 2 +- arch/powerpc/kernel/entry_64.S | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- arch/powerpc/platforms/powermac/Makefile | 2 +- arch/sparc64/Kconfig | 2 +- arch/sparc64/Kconfig.debug | 2 +- arch/sparc64/lib/mcount.S | 4 ++-- arch/x86/Kconfig | 2 +- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/entry_32.S | 4 ++-- arch/x86/kernel/entry_64.S | 4 ++-- arch/x86/kernel/i386_ksyms_32.c | 2 +- arch/x86/kernel/x8664_ksyms_64.c | 2 +- arch/x86/xen/Makefile | 2 +- include/asm-x86/ftrace.h | 4 ++-- include/linux/ftrace.h | 12 ++++++------ kernel/Makefile | 4 ++-- kernel/sysctl.c | 2 +- kernel/trace/Kconfig | 17 +++++++++-------- kernel/trace/Makefile | 6 +++--- kernel/trace/trace.c | 2 +- kernel/trace/trace.h | 2 +- kernel/trace/trace_irqsoff.c | 4 ++-- kernel/trace/trace_sched_wakeup.c | 4 ++-- kernel/trace/trace_selftest.c | 4 ++-- lib/Makefile | 2 +- 36 files changed, 61 insertions(+), 60 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/Makefile b/Makefile index 16e3fbb968a..b7eb70b13ca 100644 --- a/Makefile +++ b/Makefile @@ -536,7 +536,7 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER KBUILD_CFLAGS += -pg endif diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4853f9df37b..c2f18ea4050 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -16,8 +16,8 @@ config ARM select HAVE_ARCH_KGDB select HAVE_KPROBES if (!XIP_KERNEL) select HAVE_KRETPROBES if (HAVE_KPROBES) - select HAVE_FTRACE if (!XIP_KERNEL) - select HAVE_DYNAMIC_FTRACE if (HAVE_FTRACE) + select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) + select HAVE_DYNAMIC_FTRACE if (HAVE_FUNCTION_TRACER) select HAVE_GENERIC_DMA_COHERENT help The ARM series is a line of low-power-consumption RISC chip designs diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 7a03f200788..c47f2a3f8f8 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -70,7 +70,7 @@ SEDFLAGS = s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/ targets := vmlinux vmlinux.lds piggy.gz piggy.o font.o font.c \ head.o misc.o $(OBJS) -ifeq ($(CONFIG_FTRACE),y) +ifeq ($(CONFIG_FUNCTION_TRACER),y) ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) endif diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 584ef9a8e5a..39c8bc1a006 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -1,7 +1,7 @@ #ifndef _ASM_ARM_FTRACE #define _ASM_ARM_FTRACE -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 2357b1cf1cf..c74f766ffc1 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -183,6 +183,6 @@ EXPORT_SYMBOL(_find_next_bit_be); EXPORT_SYMBOL(copy_page); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER EXPORT_SYMBOL(mcount); #endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 3aa14dcc5ba..06269ea375c 100644 --- 
a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -101,7 +101,7 @@ ENDPROC(ret_from_fork) #undef CALL #define CALL(x) .long x -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) stmdb sp!, {r0-r3, lr} @@ -149,7 +149,7 @@ trace: ftrace_stub: mov pc, lr -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /*============================================================================= * SWI handler diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 380baa1780e..97d86702e2d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -112,7 +112,7 @@ config PPC bool default y select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE + select HAVE_FUNCTION_TRACER select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 24dd1a37f8f..1f066706994 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -122,7 +122,7 @@ KBUILD_CFLAGS += -mcpu=powerpc endif # Work around a gcc code-gen bug with -fno-omit-frame-pointer. -ifeq ($(CONFIG_FTRACE),y) +ifeq ($(CONFIG_FUNCTION_TRACER),y) KBUILD_CFLAGS += -mno-sched-epilog endif diff --git a/arch/powerpc/include/asm/ftrace.h b/arch/powerpc/include/asm/ftrace.h index de921326cca..b298f7a631e 100644 --- a/arch/powerpc/include/asm/ftrace.h +++ b/arch/powerpc/include/asm/ftrace.h @@ -1,7 +1,7 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(_mcount)) #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fdb58253fa5..92673b43858 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -12,7 +12,7 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1cbbf703364..7ecc0d1855c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -1158,7 +1158,7 @@ machine_check_in_rtas: #endif /* CONFIG_PPC_RTAS */ -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fd8b4bae9b0..e6d52845854 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -884,7 +884,7 @@ _GLOBAL(enter_prom) mtlr r0 blr -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE _GLOBAL(mcount) _GLOBAL(_mcount) diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 8edc2359c41..260089dccfb 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(single_step_exception); EXPORT_SYMBOL(sys_sigreturn); #endif -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER EXPORT_SYMBOL(_mcount); #endif diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index be60d64be7a..50f16939255 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -1,6 +1,6 @@ CFLAGS_bootx_init.o += -fPIC -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace early boot code CFLAGS_REMOVE_bootx_init.o = -pg 
-mno-sched-epilog endif diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 5446e2a499b..d269400d286 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -12,7 +12,7 @@ config SPARC64 bool default y select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE + select HAVE_FUNCTION_TRACER select HAVE_IDE select HAVE_LMB select HAVE_ARCH_KGDB diff --git a/arch/sparc64/Kconfig.debug b/arch/sparc64/Kconfig.debug index d6d32d178fc..c40515c0669 100644 --- a/arch/sparc64/Kconfig.debug +++ b/arch/sparc64/Kconfig.debug @@ -33,7 +33,7 @@ config DEBUG_PAGEALLOC config MCOUNT bool - depends on STACK_DEBUG || FTRACE + depends on STACK_DEBUG || FUNCTION_TRACER default y config FRAME_POINTER diff --git a/arch/sparc64/lib/mcount.S b/arch/sparc64/lib/mcount.S index fad90ddb3a2..7ce9c65f359 100644 --- a/arch/sparc64/lib/mcount.S +++ b/arch/sparc64/lib/mcount.S @@ -93,7 +93,7 @@ mcount: nop 1: #endif -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE mov %o7, %o0 .globl mcount_call @@ -119,7 +119,7 @@ mcount_call: .size _mcount,.-_mcount .size mcount,.-mcount -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER .globl ftrace_stub .type ftrace_stub,#function ftrace_stub: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 40ee8080956..290e21aa774 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,7 +28,7 @@ config X86 select HAVE_KRETPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE - select HAVE_FTRACE + select HAVE_FUNCTION_TRACER select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d41f0343dc..ec3d30136bf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,7 +6,7 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4e4269c73bb..9d49facc21f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1149,7 +1149,7 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) @@ -1204,7 +1204,7 @@ trace: jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 09e7145484c..b86f332c96a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -61,7 +61,7 @@ .code64 -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) retq @@ -138,7 +138,7 @@ trace: jmp ftrace_stub END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index dd7ebee446a..43cec6bdda6 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -5,7 +5,7 @@ #include #include -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* mcount is defined in assembly */ EXPORT_SYMBOL(mcount); #endif diff --git 
a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index b545f371b5f..695e426aa35 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* mcount is defined in assembly */ EXPORT_SYMBOL(mcount); #endif diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 313947940a1..6dcefba7836 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg CFLAGS_REMOVE_time.o = -pg diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h index 1bb6f9bbe1a..233bb9b869c 100644 --- a/include/asm-x86/ftrace.h +++ b/include/asm-x86/ftrace.h @@ -1,7 +1,7 @@ #ifndef ASM_X86__FTRACE_H #define ASM_X86__FTRACE_H -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_ADDR ((long)(mcount)) #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ @@ -19,6 +19,6 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) } #endif -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* ASM_X86__FTRACE_H */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index a3d46151be1..0e952958915 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -8,7 +8,7 @@ #include #include -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER extern int ftrace_enabled; extern int @@ -36,12 +36,12 @@ void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); -#else /* !CONFIG_FTRACE */ +#else /* !CONFIG_FUNCTION_TRACER */ # define register_ftrace_function(ops) do { } while (0) # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) static inline void ftrace_kill_atomic(void) { } -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE # define FTRACE_HASHBITS 10 @@ -101,7 +101,7 @@ void ftrace_kill_atomic(void); static inline void tracer_disable(void) { -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER ftrace_enabled = 0; #endif } @@ -113,7 +113,7 @@ static inline void tracer_disable(void) */ static inline int __ftrace_enabled_save(void) { -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER int saved_ftrace_enabled = ftrace_enabled; ftrace_enabled = 0; return saved_ftrace_enabled; @@ -124,7 +124,7 @@ static inline int __ftrace_enabled_save(void) static inline void __ftrace_enabled_restore(int enabled) { -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER ftrace_enabled = enabled; #endif } diff --git a/kernel/Makefile b/kernel/Makefile index 8f9ce7ec21b..85f588a9d0b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -13,7 +13,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ CFLAGS_REMOVE_sched.o = -mno-spe -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg CFLAGS_REMOVE_lockdep_proc.o = -pg @@ -86,7 +86,7 @@ obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o -obj-$(CONFIG_FTRACE) += trace/ +obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 617d41e4d6a..619eb9f3acd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -464,7 
+464,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER { .ctl_name = CTL_UNNUMBERED, .procname = "ftrace_enabled", diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5866edbc2ed..3533c583df4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -1,11 +1,12 @@ # -# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# Architectures that offer a FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: # config NOP_TRACER bool -config HAVE_FTRACE +config HAVE_FUNCTION_TRACER bool select NOP_TRACER @@ -28,9 +29,9 @@ config TRACING select STACKTRACE select TRACEPOINTS -config FTRACE +config FUNCTION_TRACER bool "Kernel Function Tracer" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL select FRAME_POINTER select TRACING @@ -136,9 +137,9 @@ config BOOT_TRACER config STACK_TRACER bool "Trace max stack" - depends on HAVE_FTRACE + depends on HAVE_FUNCTION_TRACER depends on DEBUG_KERNEL - select FTRACE + select FUNCTION_TRACER select STACKTRACE help This special tracer records the maximum stack footprint of the @@ -155,7 +156,7 @@ config STACK_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" - depends on FTRACE + depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE depends on DEBUG_KERNEL default y @@ -165,7 +166,7 @@ config DYNAMIC_FTRACE with a No-Op instruction) as they are called. A table is created to dynamically enable them again. - This way a CONFIG_FTRACE kernel is slightly larger, but otherwise + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. 
The changes to the code are done by a kernel thread that diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index a85dfba88ba..c8228b1a49e 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -1,7 +1,7 @@ # Do not instrument the tracer itself: -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) @@ -10,13 +10,13 @@ CFLAGS_trace_selftest_dynamic.o = -pg obj-y += trace_selftest_dynamic.o endif -obj-$(CONFIG_FTRACE) += libftrace.o +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o -obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d345d649d07..aeb2f2505bc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -851,7 +851,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) preempt_enable_notrace(); } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER static void function_trace_call(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f1f99572cde..6889ca48f1f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -335,7 +335,7 @@ void update_max_tr_single(struct trace_array *tr, extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER void tracing_start_function_trace(void); void tracing_stop_function_trace(void); #else diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a7db7f040ae..9c74071c10e 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -63,7 +63,7 @@ irq_trace(void) */ static __cacheline_aligned_in_smp unsigned long max_sequence; -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -104,7 +104,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = irqsoff_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fe4a252c236..3ae93f16b56 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,7 +31,7 @@ static raw_spinlock_t wakeup_lock = static void __wakeup_reset(struct trace_array *tr); -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER /* * irqsoff uses its own tracer function to keep the overhead down: */ @@ -96,7 +96,7 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ /* * Should this new latency be reported/recorded? 
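For out-of-tree code that has to build on kernels from both sides of this rename, one option is a small compatibility guard; the sketch below is illustrative only and is not part of this patch.

/* Hypothetical compat shim, assuming only the Kconfig symbol rename:
 * older kernels define CONFIG_FTRACE, newer ones CONFIG_FUNCTION_TRACER.
 * Mapping the old spelling onto the new one lets the rest of the code
 * test a single symbol. */
#if defined(CONFIG_FTRACE) && !defined(CONFIG_FUNCTION_TRACER)
#define CONFIG_FUNCTION_TRACER 1
#endif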
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 09cf230d7ec..95815d26a04 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -70,7 +70,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) return ret; } -#ifdef CONFIG_FTRACE +#ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE @@ -226,7 +226,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) return ret; } -#endif /* CONFIG_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_IRQSOFF_TRACER int diff --git a/lib/Makefile b/lib/Makefile index 16feaab057b..7cb65d85aeb 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for some libs needed in the kernel. # -ifdef CONFIG_FTRACE +ifdef CONFIG_FUNCTION_TRACER ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) endif -- cgit v1.2.3-70-g09d2 From 27471fdb32e77ecb92f09d4ac5757785b4dc33bc Mon Sep 17 00:00:00 2001 From: Andy Henroid Date: Thu, 9 Oct 2008 11:45:22 -0700 Subject: i7300_idle driver v1.55 The Intel 7300 Memory Controller supports dynamic throttling of memory which can be used to save power when system is idle. This driver does the memory throttling when all CPUs are idle on such a system. Refer to "Intel 7300 Memory Controller Hub (MCH)" datasheet for the config space description. Signed-off-by: Andy Henroid Signed-off-by: Len Brown Signed-off-by: Venkatesh Pallipadi --- MAINTAINERS | 6 + arch/x86/Kconfig | 2 + drivers/Makefile | 1 + drivers/dma/ioat_dma.c | 3 + drivers/idle/Kconfig | 16 ++ drivers/idle/Makefile | 2 + drivers/idle/i7300_idle.c | 674 ++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/idle.h | 1 + include/linux/pci_ids.h | 1 + 9 files changed, 706 insertions(+) create mode 100644 drivers/idle/Kconfig create mode 100644 drivers/idle/Makefile create mode 100644 drivers/idle/i7300_idle.c (limited to 'arch/x86/Kconfig') diff --git a/MAINTAINERS b/MAINTAINERS index 8dae4555f10..43f71b0d2a2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2078,6 +2078,12 @@ L: linux-ide@vger.kernel.org L: linux-scsi@vger.kernel.org S: Orphan +IDLE-I7300 +P: Andy Henroid +M: andrew.d.henroid@intel.com +L: linux-pm@lists.linux-foundation.org +S: Supported + IEEE 1394 SUBSYSTEM (drivers/ieee1394) P: Ben Collins M: ben.collins@ubuntu.com diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d132..19cdfe1f237 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1536,6 +1536,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig" source "drivers/cpuidle/Kconfig" +source "drivers/idle/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 2735bde7347..f443a8a9d46 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -82,6 +82,7 @@ obj-$(CONFIG_EISA) += eisa/ obj-y += lguest/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_CPU_IDLE) += cpuidle/ +obj-y += idle/ obj-$(CONFIG_MMC) += mmc/ obj-$(CONFIG_MEMSTICK) += memstick/ obj-$(CONFIG_NEW_LEDS) += leds/ diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index bc8c6e3470c..f8396cafa05 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -171,6 +171,9 @@ static int ioat_dma_enumerate_channels(struct ioatdma_device *device) xfercap_scale = readb(device->reg_base + IOAT_XFERCAP_OFFSET); xfercap = (xfercap_scale == 0 ? 
-1 : (1UL << xfercap_scale)); +#ifdef CONFIG_I7300_IDLE_IOAT_CHANNEL + device->common.chancnt--; +#endif for (i = 0; i < device->common.chancnt; i++) { ioat_chan = kzalloc(sizeof(*ioat_chan), GFP_KERNEL); if (!ioat_chan) { diff --git a/drivers/idle/Kconfig b/drivers/idle/Kconfig new file mode 100644 index 00000000000..f5b26dd579e --- /dev/null +++ b/drivers/idle/Kconfig @@ -0,0 +1,16 @@ + +menu "Memory power savings" + +config I7300_IDLE_IOAT_CHANNEL + bool + +config I7300_IDLE + tristate "Intel chipset idle power saving driver" + select I7300_IDLE_IOAT_CHANNEL + depends on X86_64 + help + Enable idle power savings with certain Intel server chipsets. + The chipset must have I/O AT support, such as the Intel 7300. + The power savings depend on the type and quantity of DRAM devices. + +endmenu diff --git a/drivers/idle/Makefile b/drivers/idle/Makefile new file mode 100644 index 00000000000..5f68fc377e2 --- /dev/null +++ b/drivers/idle/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_I7300_IDLE) += i7300_idle.o + diff --git a/drivers/idle/i7300_idle.c b/drivers/idle/i7300_idle.c new file mode 100644 index 00000000000..59d1bbc3cd3 --- /dev/null +++ b/drivers/idle/i7300_idle.c @@ -0,0 +1,674 @@ +/* + * (C) Copyright 2008 Intel Corporation + * Authors: + * Andy Henroid + * Venkatesh Pallipadi + */ + +/* + * Save DIMM power on Intel 7300-based platforms when all CPUs/cores + * are idle, using the DIMM thermal throttling capability. + * + * This driver depends on the Intel integrated DMA controller (I/O AT). + * If the driver for I/O AT (drivers/dma/ioatdma*) is also enabled, + * this driver should work cooperatively. + */ + +/* #define DEBUG */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../dma/ioatdma_hw.h" +#include "../dma/ioatdma_registers.h" + +#define I7300_IDLE_DRIVER_VERSION "1.55" +#define I7300_PRINT "i7300_idle:" + +static int debug; +module_param_named(debug, debug, uint, 0644); +MODULE_PARM_DESC(debug, "Enable debug printks in this driver"); + +#define dprintk(fmt, arg...)
\ + do { if (debug) printk(KERN_INFO I7300_PRINT fmt, ##arg); } while (0) + +/* + * Value to set THRTLOW to when initiating throttling + * 0 = No throttling + * 1 = Throttle when > 4 activations per eval window (Maximum throttling) + * 2 = Throttle when > 8 activations + * 168 = Throttle when > 168 activations (Minimum throttling) + */ +#define MAX_THRTLWLIMIT 168 +static uint i7300_idle_thrtlowlm = 1; +module_param_named(thrtlwlimit, i7300_idle_thrtlowlm, uint, 0644); +MODULE_PARM_DESC(thrtlwlimit, + "Value for THRTLOWLM activation field " + "(0 = disable throttle, 1 = Max throttle, 168 = Min throttle)"); + +/* + * simple invocation and duration statistics + */ +static unsigned long total_starts; +static unsigned long total_us; + +#ifdef DEBUG +static unsigned long past_skip; +#endif + +static struct pci_dev *fbd_dev; + +static spinlock_t i7300_idle_lock; +static int i7300_idle_active; + +static u8 i7300_idle_thrtctl_saved; +static u8 i7300_idle_thrtlow_saved; +static u32 i7300_idle_mc_saved; + +static cpumask_t idle_cpumask; +static ktime_t start_ktime; +static unsigned long avg_idle_us; + +static struct dentry *debugfs_dir; + +/* Begin: I/O AT Helper routines */ + +#define IOAT_CHANBASE(ioat_ctl, chan) (ioat_ctl + 0x80 + 0x80 * chan) +/* Snoop control (disable snoops when coherency is not important) */ +#define IOAT_DESC_SADDR_SNP_CTL (1UL << 1) +#define IOAT_DESC_DADDR_SNP_CTL (1UL << 2) + +static struct pci_dev *ioat_dev; +static struct ioat_dma_descriptor *ioat_desc; /* I/O AT desc & data (1 page) */ +static unsigned long ioat_desc_phys; +static u8 *ioat_iomap; /* I/O AT memory-mapped control regs (aka CB_BAR) */ +static u8 *ioat_chanbase; + +/* Start I/O AT memory copy */ +static int i7300_idle_ioat_start(void) +{ + u32 err; + /* Clear error (due to circular descriptor pointer) */ + err = readl(ioat_chanbase + IOAT_CHANERR_OFFSET); + if (err) + writel(err, ioat_chanbase + IOAT_CHANERR_OFFSET); + + writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + return 0; +} + +/* Stop I/O AT memory copy */ +static void i7300_idle_ioat_stop(void) +{ + int i; + u8 sts; + + for (i = 0; i < 5; i++) { + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + udelay(10); + + sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + if (sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) + break; + + } + + if (i == 5) + dprintk("failed to suspend+reset I/O AT after 5 retries\n"); + +} + +/* Test I/O AT by copying 1024 byte from 2k to 1k */ +static int __init i7300_idle_ioat_selftest(u8 *ctl, + struct ioat_dma_descriptor *desc, unsigned long desc_phys) +{ + u64 chan_sts; + + memset(desc, 0, 2048); + memset((u8 *) desc + 2048, 0xab, 1024); + + desc[0].size = 1024; + desc[0].ctl = 0; + desc[0].src_addr = desc_phys + 2048; + desc[0].dst_addr = desc_phys + 1024; + desc[0].next = 0; + + writeb(IOAT_CHANCMD_RESET, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + writeb(IOAT_CHANCMD_START, ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + udelay(1000); + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_DONE) { + /* Not complete, reset the channel */ + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + return -1; + } + + if (*(u32 *) ((u8 *) desc + 3068) != 0xabababab || + *(u32 *) ((u8 *) desc + 2044) != 0xabababab) { + dprintk("Data values src 0x%x, dest 0x%x, memset 0x%x\n", + *(u32 *) ((u8 *) desc + 2048), + *(u32 *) ((u8 *) desc + 1024), + 
*(u32 *) ((u8 *) desc + 3072)); + return -1; + } + return 0; +} + +static struct device dummy_dma_dev = { + .bus_id = "fallback device", + .coherent_dma_mask = DMA_64BIT_MASK, + .dma_mask = &dummy_dma_dev.coherent_dma_mask, +}; + +/* Setup and initialize I/O AT */ +/* This driver needs I/O AT as the throttling takes effect only when there is + * some memory activity. We use I/O AT to set up a dummy copy, while all CPUs + * go idle and memory is throttled. + */ +static int __init i7300_idle_ioat_init(void) +{ + u8 ver, chan_count, ioat_chan; + u16 chan_ctl; + + ioat_iomap = (u8 *) ioremap_nocache(pci_resource_start(ioat_dev, 0), + pci_resource_len(ioat_dev, 0)); + + if (!ioat_iomap) { + printk(KERN_ERR I7300_PRINT "failed to map I/O AT registers\n"); + goto err_ret; + } + + ver = readb(ioat_iomap + IOAT_VER_OFFSET); + if (ver != IOAT_VER_1_2) { + printk(KERN_ERR I7300_PRINT "unknown I/O AT version (%u.%u)\n", + ver >> 4, ver & 0xf); + goto err_unmap; + } + + chan_count = readb(ioat_iomap + IOAT_CHANCNT_OFFSET); + if (!chan_count) { + printk(KERN_ERR I7300_PRINT "unexpected # of I/O AT channels " + "(%u)\n", + chan_count); + goto err_unmap; + } + + ioat_chan = chan_count - 1; + ioat_chanbase = IOAT_CHANBASE(ioat_iomap, ioat_chan); + + chan_ctl = readw(ioat_chanbase + IOAT_CHANCTRL_OFFSET); + if (chan_ctl & IOAT_CHANCTRL_CHANNEL_IN_USE) { + printk(KERN_ERR I7300_PRINT "channel %d in use\n", ioat_chan); + goto err_unmap; + } + + writew(IOAT_CHANCTRL_CHANNEL_IN_USE, + ioat_chanbase + IOAT_CHANCTRL_OFFSET); + + ioat_desc = (struct ioat_dma_descriptor *)dma_alloc_coherent( + &dummy_dma_dev, 4096, + (dma_addr_t *)&ioat_desc_phys, GFP_KERNEL); + if (!ioat_desc) { + printk(KERN_ERR I7300_PRINT "failed to allocate I/O AT desc\n"); + goto err_mark_unused; + } + + writel(ioat_desc_phys & 0xffffffffUL, + ioat_chanbase + IOAT1_CHAINADDR_OFFSET_LOW); + writel(ioat_desc_phys >> 32, + ioat_chanbase + IOAT1_CHAINADDR_OFFSET_HIGH); + + if (i7300_idle_ioat_selftest(ioat_iomap, ioat_desc, ioat_desc_phys)) { + printk(KERN_ERR I7300_PRINT "I/O AT self-test failed\n"); + goto err_free; + } + + /* Setup circular I/O AT descriptor chain */ + ioat_desc[0].ctl = IOAT_DESC_SADDR_SNP_CTL | IOAT_DESC_DADDR_SNP_CTL; + ioat_desc[0].src_addr = ioat_desc_phys + 2048; + ioat_desc[0].dst_addr = ioat_desc_phys + 3072; + ioat_desc[0].size = 128; + ioat_desc[0].next = ioat_desc_phys + sizeof(struct ioat_dma_descriptor); + + ioat_desc[1].ctl = ioat_desc[0].ctl; + ioat_desc[1].src_addr = ioat_desc[0].src_addr; + ioat_desc[1].dst_addr = ioat_desc[0].dst_addr; + ioat_desc[1].size = ioat_desc[0].size; + ioat_desc[1].next = ioat_desc_phys; + + return 0; + +err_free: + dma_free_coherent(&dummy_dma_dev, 4096, (void *)ioat_desc, 0); +err_mark_unused: + writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); +err_unmap: + iounmap(ioat_iomap); +err_ret: + return -ENODEV; +} + +/* Cleanup I/O AT */ +static void __exit i7300_idle_ioat_exit(void) +{ + int i; + u64 chan_sts; + + i7300_idle_ioat_stop(); + + /* Wait for a while for the channel to halt before releasing */ + for (i = 0; i < 10; i++) { + writeb(IOAT_CHANCMD_RESET, + ioat_chanbase + IOAT1_CHANCMD_OFFSET); + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + if (chan_sts != IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { + writew(0, ioat_chanbase + IOAT_CHANCTRL_OFFSET); + break; + } + udelay(1000); + } + + chan_sts = readq(ioat_chanbase + IOAT1_CHANSTS_OFFSET) & + IOAT_CHANSTS_DMA_TRANSFER_STATUS; + + /* + * We tried to reset multiple times. 
If the IO A/T channel is still active, + * flag an error and return without cleanup. Memory leak is better + * than random corruption in that extreme error situation. + */ + if (chan_sts == IOAT_CHANSTS_DMA_TRANSFER_STATUS_ACTIVE) { + printk(KERN_ERR I7300_PRINT "Unable to stop IO A/T channels." + " Not freeing resources\n"); + return; + } + + dma_free_coherent(&dummy_dma_dev, 4096, (void *)ioat_desc, 0); + iounmap(ioat_iomap); +} + +/* End: I/O AT Helper routines */ + +#define DIMM_THRTLOW 0x64 +#define DIMM_THRTCTL 0x67 +#define DIMM_THRTCTL_THRMHUNT (1UL << 0) +#define DIMM_MC 0x40 +#define DIMM_GTW_MODE (1UL << 17) +#define DIMM_GBLACT 0x60 + +/* + * Keep track of an exponential-decaying average of recent idle durations. + * The latest duration gets DURATION_WEIGHT_PCT percentage weight + * in this average, with the old average getting the remaining weight. + * + * High weights emphasize recent history, low weights include long history. + */ +#define DURATION_WEIGHT_PCT 55 + +/* + * When the decaying average of recent durations or the predicted duration + * of the next timer interrupt is shorter than duration_threshold, the + * driver will decline to throttle. + */ +#define DURATION_THRESHOLD_US 100 + + +/* Store DIMM thermal throttle configuration */ +static int i7300_idle_thrt_save(void) +{ + u32 new_mc_val; + u8 gblactlm; + + pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &i7300_idle_thrtctl_saved); + pci_read_config_byte(fbd_dev, DIMM_THRTLOW, &i7300_idle_thrtlow_saved); + pci_read_config_dword(fbd_dev, DIMM_MC, &i7300_idle_mc_saved); + /* + * Make sure we have Global Throttling Window Mode set to have a + * "short" window. This (mostly) works around an issue where + * throttling persists until the end of the global throttling window + * size. On the tested system, this was resulting in a maximum of + * 64 ms to exit throttling (average 32 ms). The actual numbers + * depend on system frequencies. Setting the short window reduces + * this by a factor of 4096. + * + * We will only do this if the system is set for + * unlimited-activations while in open-loop throttling (i.e., when + * Global Activation Throttle Limit is zero).
+ */ + pci_read_config_byte(fbd_dev, DIMM_GBLACT, &gblactlm); + dprintk("thrtctl_saved = 0x%02x, thrtlow_saved = 0x%02x\n", + i7300_idle_thrtctl_saved, + i7300_idle_thrtlow_saved); + dprintk("mc_saved = 0x%08x, gblactlm = 0x%02x\n", + i7300_idle_mc_saved, + gblactlm); + if (gblactlm == 0) { + new_mc_val = i7300_idle_mc_saved | DIMM_GTW_MODE; + pci_write_config_dword(fbd_dev, DIMM_MC, new_mc_val); + return 0; + } else { + dprintk("could not set GTW_MODE = 1 (OLTT enabled)\n"); + return -ENODEV; + } +} + +/* Restore DIMM thermal throttle configuration */ +static void i7300_idle_thrt_restore(void) +{ + pci_write_config_dword(fbd_dev, DIMM_MC, i7300_idle_mc_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved); +} + +/* Enable DIMM thermal throttling */ +static void i7300_idle_start(void) +{ + u8 new_ctl; + u8 limit; + + new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); + + limit = i7300_idle_thrtlowlm; + if (unlikely(limit > MAX_THRTLWLIMIT)) + limit = MAX_THRTLWLIMIT; + + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, limit); + + new_ctl = i7300_idle_thrtctl_saved | DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); +} + +/* Disable DIMM thermal throttling */ +static void i7300_idle_stop(void) +{ + u8 new_ctl; + u8 got_ctl; + + new_ctl = i7300_idle_thrtctl_saved & ~DIMM_THRTCTL_THRMHUNT; + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, new_ctl); + + pci_write_config_byte(fbd_dev, DIMM_THRTLOW, i7300_idle_thrtlow_saved); + pci_write_config_byte(fbd_dev, DIMM_THRTCTL, i7300_idle_thrtctl_saved); + pci_read_config_byte(fbd_dev, DIMM_THRTCTL, &got_ctl); + WARN_ON_ONCE(got_ctl != i7300_idle_thrtctl_saved); +} + + +/* + * i7300_avg_duration_check() + * return 0 if the decaying average of recent idle durations is + * more than DURATION_THRESHOLD_US + */ +static int i7300_avg_duration_check(void) +{ + if (avg_idle_us >= DURATION_THRESHOLD_US) + return 0; + +#ifdef DEBUG + past_skip++; +#endif + return 1; +} + +/* Idle notifier to look at idle CPUs */ +static int i7300_idle_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + unsigned long flags; + ktime_t now_ktime; + static ktime_t idle_begin_time; + static int time_init = 1; + + if (!i7300_idle_thrtlowlm) + return 0; + + if (unlikely(time_init)) { + time_init = 0; + idle_begin_time = ktime_get(); + } + + spin_lock_irqsave(&i7300_idle_lock, flags); + if (val == IDLE_START) { + + cpu_set(smp_processor_id(), idle_cpumask); + + if (cpus_weight(idle_cpumask) != num_online_cpus()) + goto end; + + now_ktime = ktime_get(); + idle_begin_time = now_ktime; + + if (i7300_avg_duration_check()) + goto end; + + i7300_idle_active = 1; + total_starts++; + start_ktime = now_ktime; + + i7300_idle_start(); + i7300_idle_ioat_start(); + + } else if (val == IDLE_END) { + cpu_clear(smp_processor_id(), idle_cpumask); + if (cpus_weight(idle_cpumask) == (num_online_cpus() - 1)) { + /* First CPU coming out of idle */ + u64 idle_duration_us; + + now_ktime = ktime_get(); + + idle_duration_us = ktime_to_us(ktime_sub + (now_ktime, idle_begin_time)); + + avg_idle_us = + ((100 - DURATION_WEIGHT_PCT) * avg_idle_us + + DURATION_WEIGHT_PCT * idle_duration_us) / 100; + + if (i7300_idle_active) { + ktime_t idle_ktime; + + idle_ktime = ktime_sub(now_ktime, start_ktime); + total_us += ktime_to_us(idle_ktime); + + i7300_idle_ioat_stop(); + i7300_idle_stop(); + i7300_idle_active 
= 0; + } + } + } +end: + spin_unlock_irqrestore(&i7300_idle_lock, flags); + return 0; +} + +static struct notifier_block i7300_idle_nb = { + .notifier_call = i7300_idle_notifier, +}; + +/* + * I/O AT controls (PCI bus 0 device 8 function 0) + * DIMM controls (PCI bus 0 device 16 function 1) + */ +#define IOAT_BUS 0 +#define IOAT_DEVFN PCI_DEVFN(8, 0) +#define MEMCTL_BUS 0 +#define MEMCTL_DEVFN PCI_DEVFN(16, 1) + +struct fbd_ioat { + unsigned int vendor; + unsigned int ioat_dev; +}; + +/* + * The i5000 chip-set has the same hooks as the i7300 + * but support is disabled by default because this driver + * has not been validated on that platform. + */ +#define SUPPORT_I5000 0 + +static const struct fbd_ioat fbd_ioat_list[] = { + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_CNB}, +#if SUPPORT_I5000 + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT}, +#endif + {0, 0} +}; + +/* table of devices that work with this driver */ +static const struct pci_device_id pci_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_FBD_CNB) }, +#if SUPPORT_I5000 + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5000_ERR) }, +#endif + { } /* Terminating entry */ +}; + +MODULE_DEVICE_TABLE(pci, pci_tbl); + +/* Check for known platforms with I/O-AT */ +static int __init i7300_idle_platform_probe(void) +{ + int i; + + fbd_dev = pci_get_bus_and_slot(MEMCTL_BUS, MEMCTL_DEVFN); + if (!fbd_dev) + return -ENODEV; + + for (i = 0; pci_tbl[i].vendor != 0; i++) { + if (fbd_dev->vendor == pci_tbl[i].vendor && + fbd_dev->device == pci_tbl[i].device) { + break; + } + } + if (pci_tbl[i].vendor == 0) + return -ENODEV; + + ioat_dev = pci_get_bus_and_slot(IOAT_BUS, IOAT_DEVFN); + if (!ioat_dev) + return -ENODEV; + + for (i = 0; fbd_ioat_list[i].vendor != 0; i++) { + if (ioat_dev->vendor == fbd_ioat_list[i].vendor && + ioat_dev->device == fbd_ioat_list[i].ioat_dev) { + return 0; + } + } + return -ENODEV; +} + +int stats_open_generic(struct inode *inode, struct file *fp) +{ + fp->private_data = inode->i_private; + return 0; +} + +static ssize_t stats_read_ul(struct file *fp, char __user *ubuf, size_t count, + loff_t *off) +{ + unsigned long *p = fp->private_data; + char buf[32]; + int len; + + len = snprintf(buf, 32, "%lu\n", *p); + return simple_read_from_buffer(ubuf, count, off, buf, len); +} + +static const struct file_operations idle_fops = { + .open = stats_open_generic, + .read = stats_read_ul, +}; + +struct debugfs_file_info { + void *ptr; + char name[32]; + struct dentry *file; +} debugfs_file_list[] = { + {&total_starts, "total_starts", NULL}, + {&total_us, "total_us", NULL}, +#ifdef DEBUG + {&past_skip, "past_skip", NULL}, +#endif + {NULL, "", NULL} + }; + +static int __init i7300_idle_init(void) +{ + spin_lock_init(&i7300_idle_lock); + cpus_clear(idle_cpumask); + total_us = 0; + + if (i7300_idle_platform_probe()) + return -ENODEV; + + if (i7300_idle_thrt_save()) + return -ENODEV; + + if (i7300_idle_ioat_init()) + return -ENODEV; + + debugfs_dir = debugfs_create_dir("i7300_idle", NULL); + if (debugfs_dir) { + int i = 0; + + while (debugfs_file_list[i].ptr != NULL) { + debugfs_file_list[i].file = debugfs_create_file( + debugfs_file_list[i].name, + S_IRUSR, + debugfs_dir, + debugfs_file_list[i].ptr, + &idle_fops); + i++; + } + } + + idle_notifier_register(&i7300_idle_nb); + + printk(KERN_INFO "i7300_idle: loaded v%s\n", I7300_IDLE_DRIVER_VERSION); + return 0; +} + +static void __exit i7300_idle_exit(void) +{ + idle_notifier_unregister(&i7300_idle_nb); + + if (debugfs_dir) { + int i = 0; + + while 
(debugfs_file_list[i].file != NULL) { + debugfs_remove(debugfs_file_list[i].file); + i++; + } + + debugfs_remove(debugfs_dir); + } + i7300_idle_thrt_restore(); + i7300_idle_ioat_exit(); +} + +module_init(i7300_idle_init); +module_exit(i7300_idle_exit); + +MODULE_AUTHOR("Andy Henroid "); +MODULE_DESCRIPTION("Intel Chipset DIMM Idle Power Saving Driver v" + I7300_IDLE_DRIVER_VERSION); +MODULE_LICENSE("GPL"); diff --git a/include/asm-x86/idle.h b/include/asm-x86/idle.h index cbb64912361..54ce018d4b6 100644 --- a/include/asm-x86/idle.h +++ b/include/asm-x86/idle.h @@ -6,6 +6,7 @@ struct notifier_block; void idle_notifier_register(struct notifier_block *n); +void idle_notifier_unregister(struct notifier_block *n); void enter_idle(void); void exit_idle(void); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index f1624b39675..efb786d11f2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2422,6 +2422,7 @@ #define PCI_DEVICE_ID_INTEL_MCH_PC1 0x359a #define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e #define PCI_DEVICE_ID_INTEL_IOAT_CNB 0x360b +#define PCI_DEVICE_ID_INTEL_FBD_CNB 0x360c #define PCI_DEVICE_ID_INTEL_ICH10_0 0x3a14 #define PCI_DEVICE_ID_INTEL_ICH10_1 0x3a16 #define PCI_DEVICE_ID_INTEL_ICH10_2 0x3a18 -- cgit v1.2.3-70-g09d2 From 9e899816d126cc6f7d405c349f65363214fe7399 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 22 Oct 2008 12:33:16 +0200 Subject: x86, mm: enable GBPAGES option by default DIRECT_GBPAGES was under DEBUG_KERNEL && EXPERIMENTAL and disabled by default. Turn it on by default and put it under EMBEDDED. Signed-off-by: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 +++++++++ arch/x86/Kconfig.debug | 12 ------------ 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5b9b12321ad..c00aefcb47d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -946,6 +946,15 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config DIRECT_GBPAGES + bool "Enable 1GB pages for kernel pagetables" if EMBEDDED + default y + depends on X86_64 + help + Allow the kernel linear mapping to use 1GB pages on CPUs that + support it. This can improve the kernel's performance a tiny bit by + reducing TLB pressure. If in doubt, say "Y". + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e67..567fe543e09 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -114,18 +114,6 @@ config DEBUG_RODATA data. This is recommended so that we can catch kernel bugs sooner. If in doubt, say "Y". -config DIRECT_GBPAGES - bool "Enable gbpages-mapped kernel pagetables" - depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64 - help - Enable gigabyte pages support (if the CPU supports it). This can - improve the kernel's performance a tiny bit by reducing TLB - pressure. - - This is experimental code. - - If in doubt, say "N". 
- config DEBUG_RODATA_TEST bool "Testcase for the DEBUG_RODATA feature" depends on DEBUG_RODATA -- cgit v1.2.3-70-g09d2 From 7a5276889cfa96619bf863c87581005f46139986 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 30 Oct 2008 10:38:24 +0000 Subject: x86: simplify X86_MPPARSE config option Impact: cleanup Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350bee1d54d..f843de13e24 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -235,21 +235,13 @@ config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -if ACPI config X86_MPPARSE - def_bool y - bool "Enable MPS table" + bool "Enable MPS table" if ACPI + default y depends on X86_LOCAL_APIC help For old smp systems that do not have proper acpi support. Newer systems (esp with 64bit cpus) with acpi support, MADT and DSDT will override it -endif - -if !ACPI -config X86_MPPARSE - def_bool y - depends on X86_LOCAL_APIC -endif choice prompt "Subarchitecture Type" -- cgit v1.2.3-70-g09d2 From b3572e361b6b2ac5e724bc4bb932b7774b720b95 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 30 Oct 2008 16:00:59 -0500 Subject: x86/voyager: fix compile breakage caused by dc1e35c6e95e8923cf1d3510438b63c600fee1e2 Impact: build fix on x86/Voyager Given commits like this: | Author: Suresh Siddha | Date: Tue Jul 29 10:29:19 2008 -0700 | | x86, xsave: enable xsave/xrstor on cpus with xsave support Which deliberately expose boot cpu dependence to pieces of the system, I think it's time to explicitly have a variable for it to prevent this continual misassumption that the boot CPU is zero. Signed-off-by: James Bottomley Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/smp.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350bee1d54d..2a40c4c6dd7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -231,6 +231,10 @@ config SMP If you don't know what to do here, say N. +config X86_HAS_BOOT_CPU_ID + def_bool y + depends on X86_VOYAGER + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 2766021aef8..d12811ce51d 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -225,5 +225,11 @@ static inline int hard_smp_processor_id(void) #endif /* CONFIG_X86_LOCAL_APIC */ +#ifdef CONFIG_X86_HAS_BOOT_CPU_ID +extern unsigned char boot_cpu_id; +#else +#define boot_cpu_id 0 +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_SMP_H */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 25581dcb280..93e9393ea64 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1134,7 +1134,7 @@ void __cpuinit cpu_init(void) /* * Boot processor to setup the FP and extended state context info. */ - if (!smp_processor_id()) + if (smp_processor_id() == boot_cpu_id) init_thread_xstate(); xsave_init(); -- cgit v1.2.3-70-g09d2 From 3555105333ae55414d0fe051557bd7dc590f5255 Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Fri, 31 Oct 2008 10:52:03 -0700 Subject: x86: add memory hotremove config option Impact: enable CONFIG_MEMORY_HOTREMOVE feature on x86. 
(default-off) Memory hotremove functionality can currently be configured into the ia64, powerpc, and s390 kernels. This patch makes it possible to configure the memory hotremove functionality into the x86 kernel as well. Signed-off-by: Badari Pulavarty Signed-off-by: Gary Hade Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c00aefcb47d..25e71152611 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1492,6 +1492,10 @@ config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y depends on X86_64 || (X86_32 && HIGHMEM) +config ARCH_ENABLE_MEMORY_HOTREMOVE + def_bool y + depends on MEMORY_HOTPLUG + config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA -- cgit v1.2.3-70-g09d2 From e5beae16901795223d677f15aa2fe192976278ee Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Mon, 3 Nov 2008 18:21:45 +0100 Subject: io mapping: clean up #ifdefs Impact: cleanup clean up ifdefs: change #ifdef CONFIG_X86_32/64 to CONFIG_HAVE_ATOMIC_IOMAP. flip around the #ifdef sections to clean up the structure. Signed-off-by: Keith Packard Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ include/linux/io-mapping.h | 43 +++++++++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 18 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d315..e60c59b81bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1894,6 +1894,10 @@ config SYSVIPC_COMPAT endmenu +config HAVE_ATOMIC_IOMAP + def_bool y + depends on X86_32 + source "net/Kconfig" source "drivers/Kconfig" diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 1b566993db6..82df31726a5 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -33,86 +33,93 @@ /* this struct isn't actually defined anywhere */ struct io_mapping; -#ifdef CONFIG_X86_64 +#ifdef CONFIG_HAVE_ATOMIC_IOMAP + +/* + * For small address space machines, mapping large objects + * into the kernel virtual space isn't practical. Where + * available, use fixmap support to dynamically map pages + * of the object at run time. 
+ */ -/* Create the io_mapping object*/ static inline struct io_mapping * io_mapping_create_wc(unsigned long base, unsigned long size) { - return (struct io_mapping *) ioremap_wc(base, size); + return (struct io_mapping *) base; } static inline void io_mapping_free(struct io_mapping *mapping) { - iounmap(mapping); } /* Atomic map/unmap */ static inline void * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { - return ((char *) mapping) + offset; + offset += (unsigned long) mapping; + return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0, + __pgprot(__PAGE_KERNEL_WC)); } static inline void io_mapping_unmap_atomic(void *vaddr) { + iounmap_atomic(vaddr, KM_USER0); } -/* Non-atomic map/unmap */ static inline void * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) { - return ((char *) mapping) + offset; + offset += (unsigned long) mapping; + return ioremap_wc(offset, PAGE_SIZE); } static inline void io_mapping_unmap(void *vaddr) { + iounmap(vaddr); } -#endif /* CONFIG_X86_64 */ +#else -#ifdef CONFIG_X86_32 +/* Create the io_mapping object*/ static inline struct io_mapping * io_mapping_create_wc(unsigned long base, unsigned long size) { - return (struct io_mapping *) base; + return (struct io_mapping *) ioremap_wc(base, size); } static inline void io_mapping_free(struct io_mapping *mapping) { + iounmap(mapping); } /* Atomic map/unmap */ static inline void * io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { - offset += (unsigned long) mapping; - return iomap_atomic_prot_pfn(offset >> PAGE_SHIFT, KM_USER0, - __pgprot(__PAGE_KERNEL_WC)); + return ((char *) mapping) + offset; } static inline void io_mapping_unmap_atomic(void *vaddr) { - iounmap_atomic(vaddr, KM_USER0); } +/* Non-atomic map/unmap */ static inline void * io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) { - offset += (unsigned long) mapping; - return ioremap_wc(offset, PAGE_SIZE); + return ((char *) mapping) + offset; } static inline void io_mapping_unmap(void *vaddr) { - iounmap(vaddr); } -#endif /* CONFIG_X86_32 */ + +#endif /* HAVE_ATOMIC_IOMAP */ #endif /* _LINUX_IO_MAPPING_H */ -- cgit v1.2.3-70-g09d2 From 60a7ecf42661f2b22168751298592da6ee210c9e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Nov 2008 16:05:44 -0500 Subject: ftrace: add quick function trace stop Impact: quick start and stop of function tracer This patch adds a way to disable the function tracer quickly without the need to run kstop_machine. It adds a new variable called function_trace_stop which will stop the calls to functions from mcount when set. This is just an on/off switch and does not handle recursion like preempt_disable(). Its main purpose is to help other tracers/debuggers start and stop tracing functions without the need to call kstop_machine. The config option HAVE_FUNCTION_TRACE_MCOUNT_TEST is added for archs that implement the testing of the function_trace_stop in the mcount arch dependent code. Otherwise, the test is done in the C code. x86 is the only arch at the moment that supports this.
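As a usage sketch (the caller below is hypothetical; only the ftrace_stop()/ftrace_start() helpers come from this patch), a debugger-style facility would bracket its work with the new calls; note they are a bare on/off switch with no nesting count:

#include <linux/ftrace.h>

/* Sketch: freeze function tracing around a buffer dump, without
 * kstop_machine. ftrace_stop() only sets function_trace_stop, so
 * mcount is still entered; the tracer callbacks are simply skipped. */
static void dump_trace_buffers_quiesced(void)
{
	ftrace_stop();		/* function_trace_stop = 1 */
	/* ... read or print the trace buffers here ... */
	ftrace_start();		/* function_trace_stop = 0 */
}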
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 6 ++++++ arch/x86/kernel/entry_64.S | 5 +++++ include/linux/ftrace.h | 30 +++++++++++++++++++++++++++++ kernel/trace/Kconfig | 7 +++++++ kernel/trace/ftrace.c | 47 ++++++++++++++++++++++++++++++++++++---------- 6 files changed, 86 insertions(+), 10 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d315..d09e812c622 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER select HAVE_ARCH_TRACEHOOK diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca..9134de814c9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1157,6 +1157,9 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + pushl %eax pushl %ecx pushl %edx @@ -1180,6 +1183,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpl $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a..08aa6b10933 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,6 +68,8 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + cmpl $0, function_trace_stop + jne ftrace_stub /* taken from glibc */ subq $0x38, %rsp @@ -103,6 +105,9 @@ END(ftrace_caller) #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) + cmpl $0, function_trace_stop + jne ftrace_stub + cmpq $ftrace_stub, ftrace_trace_function jnz trace .globl ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4642959e5bd..794ab907dbf 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -23,6 +23,34 @@ struct ftrace_ops { struct ftrace_ops *next; }; +extern int function_trace_stop; + +/** + * ftrace_stop - stop function tracer. + * + * A quick way to stop the function tracer. Note this is an on/off switch, + * it is not something that is recursive like preempt_disable. + * This does not disable the calling of mcount, it only stops the + * calling of functions from mcount. + */ +static inline void ftrace_stop(void) +{ + function_trace_stop = 1; +} + +/** + * ftrace_start - start the function tracer. + * + * This function is the inverse of ftrace_stop. This does not enable + * the function tracing if the function tracer is disabled. This only + * sets the function tracer flag to continue calling the functions + * from mcount. + */ +static inline void ftrace_start(void) +{ + function_trace_stop = 0; +} + /* * The ftrace_ops must be static and should also * be read_mostly.
These functions do modify read_mostly variables @@ -41,6 +69,8 @@ extern void ftrace_stub(unsigned long a0, unsigned long a1); # define unregister_ftrace_function(ops) do { } while (0) # define clear_ftrace_function(ops) do { } while (0) static inline void ftrace_kill(void) { } +static inline void ftrace_stop(void) { } +static inline void ftrace_start(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 33dbefd471e..fc4febc3334 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -9,6 +9,13 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool +config HAVE_FUNCTION_TRACE_MCOUNT_TEST + bool + help + This gets selected when the arch tests the function_trace_stop + variable at the mcount call site. Otherwise, this variable + is tested by the called function. + config HAVE_DYNAMIC_FTRACE bool diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24568c..896c71f0f4c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -47,6 +47,9 @@ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; +/* Quick disabling of function tracer. */ +int function_trace_stop; + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -63,6 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { @@ -88,8 +92,23 @@ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; + __ftrace_trace_function = ftrace_stub; } +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +/* + * For those archs that do not test function_trace_stop in their + * mcount call site, we need to do it from C. + */ +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +{ + if (function_trace_stop) + return; + + __ftrace_trace_function(ip, parent_ip); +} +#endif + static int __register_ftrace_function(struct ftrace_ops *ops) { /* should not be called from interrupt context */ @@ -110,10 +129,18 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * For one func, simply call it directly. * For more than one func, call the chain.
*/ +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST if (ops->next == &ftrace_list_end) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; +#else + if (ops->next == &ftrace_list_end) + __ftrace_trace_function = ops->func; + else + __ftrace_trace_function = ftrace_list_func; + ftrace_trace_function = ftrace_test_stop_func; +#endif } spin_unlock(&ftrace_lock); @@ -526,7 +553,7 @@ static void ftrace_run_update_code(int command) } static ftrace_func_t saved_ftrace_func; -static int ftrace_start; +static int ftrace_start_up; static DEFINE_MUTEX(ftrace_start_lock); static void ftrace_startup(void) @@ -537,8 +564,8 @@ static void ftrace_startup(void) return; mutex_lock(&ftrace_start_lock); - ftrace_start++; - if (ftrace_start == 1) + ftrace_start_up++; + if (ftrace_start_up == 1) command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -562,8 +589,8 @@ static void ftrace_shutdown(void) return; mutex_lock(&ftrace_start_lock); - ftrace_start--; - if (!ftrace_start) + ftrace_start_up--; + if (!ftrace_start_up) command |= FTRACE_DISABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { @@ -589,8 +616,8 @@ static void ftrace_startup_sysctl(void) mutex_lock(&ftrace_start_lock); /* Force update next time */ saved_ftrace_func = NULL; - /* ftrace_start is true if we want ftrace running */ - if (ftrace_start) + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) command |= FTRACE_ENABLE_CALLS; ftrace_run_update_code(command); @@ -605,8 +632,8 @@ static void ftrace_shutdown_sysctl(void) return; mutex_lock(&ftrace_start_lock); - /* ftrace_start is true if ftrace is running */ - if (ftrace_start) + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) command |= FTRACE_DISABLE_CALLS; ftrace_run_update_code(command); @@ -1186,7 +1213,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (iter->filtered && ftrace_start && ftrace_enabled) + if (iter->filtered && ftrace_start_up && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); -- cgit v1.2.3-70-g09d2 From da85f865b1dcec0853c48b763ed312441ce0c7df Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 5 Nov 2008 13:37:27 -0600 Subject: x86: mention ACPI in top-level Kconfig menu Impact: clarify menuconfig text Mention ACPI in the top-level menu to give a clue as to where it lives. This matches what ia64 does. Signed-off-by: Bjorn Helgaas Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d315..5d6aa4013dc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1494,7 +1494,7 @@ config HAVE_ARCH_EARLY_PFN_TO_NID def_bool X86_64 depends on NUMA -menu "Power management options" +menu "Power management and ACPI options" depends on !X86_VOYAGER config ARCH_HIBERNATION_HEADER -- cgit v1.2.3-70-g09d2 From fd51b2d7d5df932767b89e00d0871a38a2c53e74 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 5 Nov 2008 02:27:19 +0900 Subject: x86: update CONFIG_NUMA description Impact: clarify/update CONFIG_NUMA text CONFIG_NUMA description talk about a bit old thing. So, following changes are better. o CONFIG_NUMA is no longer EXPERIMENTAL o Opteron is not the only processor of NUMA topology on x86_64 no longer, but also Intel Core7i has it. 
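The "local memory controller" point in the updated help text below can be made concrete: NUMA-aware kernel code can request node-local memory explicitly. A minimal sketch (illustrative only, not part of this patch):

#include <linux/gfp.h>
#include <linux/topology.h>

/* Sketch: allocate one page from the calling CPU's own node, the
 * placement the NUMA-aware allocator also prefers by default. */
static struct page *alloc_page_on_local_node(void)
{
	return alloc_pages_node(numa_node_id(), GFP_KERNEL, 0);
}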
Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 350bee1d54d..38ae04bf651 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -951,22 +951,26 @@ config ARCH_PHYS_ADDR_T_64BIT # Common NUMA Features config NUMA - bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" + bool "Numa Memory Allocation and Scheduler Support" depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help Enable NUMA (Non Uniform Memory Access) support. + The kernel will try to allocate memory used by a CPU on the local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 32-bit this is currently highly experimental and should be only - used for kernel development. It might also cause boot failures. - For 64-bit this is recommended on all multiprocessor Opteron systems. - If the system is EM64T, you should say N unless your system is - EM64T NUMA. + For 64-bit this is recommended if the system is Intel Core 7i + (or later), AMD Opteron, or EM64T NUMA. + + For 32-bit this is only needed on (rare) 32-bit-only platforms + that support NUMA topologies, such as NUMAQ / Summit, or if you + boot a 32-bit kernel on a 64-bit NUMA platform. + + Otherwise, you should say N. comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -- cgit v1.2.3-70-g09d2 From a87d091434ed2a34d647979ab12084139ee1fe41 Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Thu, 6 Nov 2008 11:10:49 -0800 Subject: x86, sched: enable wchan config menu item on 64-bit Enable the wchan config menu item for now on x86-64 arch? This will at least allow people to enable/disable frame pointers on scheduler functions. Signed-off-by: Ken Chen Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6f20718d315..488a4ecd0b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -367,7 +367,7 @@ config X86_RDC321X config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" - depends on X86_32 + depends on X86 help Calculate simpler /proc//wchan values. If this option is disabled then wchan values will recurse back to the -- cgit v1.2.3-70-g09d2 From 4694516d1987303dd83bfd0efdd36fa5b65d701b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 10 Nov 2008 21:52:47 +0100 Subject: x86: Make NUMA on 32-bit depend on BROKEN While investigating the failure of hibernation on 32-bit x86 with CONFIG_NUMA set, as described in this message http://marc.info/?l=linux-kernel&m=122634118116226&w=4 I asked some people for help and I was told that it wasn't really worth the effort, because CONFIG_NUMA was generally broken on 32-bit x86 systems and it shouldn't be used in such configs. For this reason, make CONFIG_NUMA depend on BROKEN instead of EXPERIMENTAL on x86-32. Signed-off-by: Rafael J. 
Wysocki Cc: Andi Kleen Cc: Pavel Machek Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4cf0ab13d18..93224b56918 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -957,7 +957,7 @@ config ARCH_PHYS_ADDR_T_64BIT config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && BROKEN) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help -- cgit v1.2.3-70-g09d2 From ae1e9130bfb9ad55eb97ec3fb17a122b7a118f98 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 09:05:16 +0100 Subject: sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER Impact: cleanup, change .config option name We had this ugly config name for a long time for hysteric raisons. Rename it to a saner name. We still cannot get rid of it completely, until /proc//stack usage replaces WCHAN usage for good. We'll be able to do that in the v2.6.29/v2.6.30 timeframe. Signed-off-by: Ingo Molnar --- arch/ia64/Kconfig | 2 +- arch/m32r/Kconfig | 2 +- arch/mips/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 2 +- include/asm-m32r/system.h | 2 +- kernel/Makefile | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 27eec71429b..59d12788b60 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -99,7 +99,7 @@ config GENERIC_IOMAP bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index dbaed4a6381..29047d5c259 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index f4af967a6b3..a5255e7c79e 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE bool default y -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 525c13a4de9..adb23ea1c1e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -141,7 +141,7 @@ config GENERIC_NVRAM bool default y if PPC32 -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d5550d19b6..74db682ec1c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -364,7 +364,7 @@ config X86_RDC321X as R-8610-(G). If you don't have one of these chips, you should say N here. 
-config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" depends on X86 diff --git a/include/asm-m32r/system.h b/include/asm-m32r/system.h index 70a57c8c002..c980f5ba8de 100644 --- a/include/asm-m32r/system.h +++ b/include/asm-m32r/system.h @@ -23,7 +23,7 @@ */ #if defined(CONFIG_FRAME_POINTER) || \ - !defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER) + !defined(CONFIG_SCHED_OMIT_FRAME_POINTER) #define M32R_PUSH_FP " push fp\n" #define M32R_POP_FP " pop fp\n" #else diff --git a/kernel/Makefile b/kernel/Makefile index e1af0397214..46e67a39849 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,7 +91,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o -ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond # me. I suspect most platforms don't need this, but until we know that for sure -- cgit v1.2.3-70-g09d2 From caf4b323b02a16c92fba449952ac6515ddc76d7a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 11 Nov 2008 07:03:45 +0100 Subject: tracing, x86: add low level support for ftrace return tracing Impact: add infrastructure for function-return tracing Add low level support for ftrace return tracing. This plug-in stores return addresses on the thread_info structure of the current task. The index of the current return address is initialized when the task is the first one (init) and when a process forks (the child). It is not needed when a task does a sys_execve because after this syscall, it still needs to return on the kernel functions it called. Note that the code of return_to_handler has been suggested by Steven Rostedt as almost all of the ideas of improvements in this V3. For purpose of security, arch/x86/kernel/process_32.c is not traced because __switch_to() changes the current task during its execution. That could cause inconsistency in the stored return address of this function even if I didn't have any crash after testing with tracing on this function enabled. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 26 ++++++ arch/x86/include/asm/thread_info.h | 24 +++++ arch/x86/kernel/Makefile | 6 ++ arch/x86/kernel/entry_32.S | 33 +++++++ arch/x86/kernel/ftrace.c | 181 +++++++++++++++++++++++++++++++++++-- include/linux/ftrace.h | 20 ++++ include/linux/ftrace_irq.h | 2 +- include/linux/sched.h | 11 +++ kernel/Makefile | 4 + 10 files changed, 300 insertions(+), 8 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 27b8a3a3991..ca91e50bdb1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,6 +11,7 @@ config 64BIT config X86_32 def_bool !64BIT + select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index f8173ed1c97..9b6a1fa19e7 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -20,4 +20,30 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_RET_TRACER +#define FTRACE_RET_STACK_SIZE 20 + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. 
+ * Used in struct thread_info + */ +struct ftrace_ret_stack { + unsigned long ret; + unsigned long func; + unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relies on ftrace_return_to_handler. + * Defined in entry32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_RET_TRACER */ + #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..a71158369fd 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,7 @@ struct task_struct; struct exec_domain; #include +#include struct thread_info { struct task_struct *task; /* main task structure */ @@ -38,8 +39,30 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* Index of current stored address in ret_stack */ + int curr_ret_stack; + /* Stack of return addresses for return function tracing */ + struct ftrace_ret_stack ret_stack[FTRACE_RET_STACK_SIZE]; +#endif }; +#ifdef CONFIG_FUNCTION_RET_TRACER +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + }, \ + .curr_ret_stack = -1,\ +} +#else #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ @@ -52,6 +75,7 @@ struct thread_info { .fn = do_no_restart_syscall, \ }, \ } +#endif #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e489ff9cb3e..1d8ed95da84 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +# Don't trace __switch_to() but let it for function tracer +CFLAGS_REMOVE_process_32.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: @@ -65,6 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 9134de814c9..9a0ac85946d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,6 +1188,10 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace +#ifdef CONFIG_FUNCTION_RET_TRACER + cmpl $ftrace_stub, ftrace_function_return + jnz trace_return +#endif .globl ftrace_stub ftrace_stub: ret @@ -1206,8 +1210,37 @@ trace: popl %edx popl %ecx popl %eax + jmp ftrace_stub +#ifdef CONFIG_FUNCTION_RET_TRACER +trace_return: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + pushl %eax + lea 0x4(%ebp), %eax + pushl %eax + call prepare_ftrace_return + addl $8, %esp + popl %edx + popl %ecx + popl %eax jmp ftrace_stub + +.globl return_to_handler +return_to_handler: + pushl $0 + pushl %eax + pushl %ecx + pushl %edx + call ftrace_return_to_handler + movl %eax, 0xc(%esp) + popl %edx + popl %ecx + popl %eax + ret +#endif /* CONFIG_FUNCTION_RET_TRACER */ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /*
CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 69149337f2f..d68033bba22 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -18,10 +18,173 @@ #include #include +#include #include +#include -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +#ifdef CONFIG_FUNCTION_RET_TRACER + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. + */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ + atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ + atomic_dec(&in_nmi); +} + +/* + * Synchronize accesses to return adresses stack with + * interrupts. + */ +static raw_spinlock_t ret_stack_lock; + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, + unsigned long func) +{ + int index; + struct thread_info *ti; + unsigned long flags; + int err = 0; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + /* The return trace stack is full */ + if (ti->curr_ret_stack == FTRACE_RET_STACK_SIZE - 1) { + err = -EBUSY; + goto out; + } + + index = ++ti->curr_ret_stack; + ti->ret_stack[index].ret = ret; + ti->ret_stack[index].func = func; + ti->ret_stack[index].calltime = time; + +out: + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); + return err; +} + +/* Retrieve a function return address to the trace stack on thread info.*/ +static void pop_return_trace(unsigned long *ret, unsigned long long *time, + unsigned long *func) +{ + struct thread_info *ti; + int index; + unsigned long flags; + + raw_local_irq_save(flags); + __raw_spin_lock(&ret_stack_lock); + + ti = current_thread_info(); + index = ti->curr_ret_stack; + *ret = ti->ret_stack[index].ret; + *func = ti->ret_stack[index].func; + *time = ti->ret_stack[index].calltime; + ti->curr_ret_stack--; + + __raw_spin_unlock(&ret_stack_lock); + raw_local_irq_restore(flags); +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ + struct ftrace_retfunc trace; + pop_return_trace(&trace.ret, &trace.calltime, &trace.func); + trace.rettime = cpu_clock(raw_smp_processor_id()); + ftrace_function_return(&trace); + + return trace.ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +asmlinkage +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ + unsigned long old; + unsigned long long calltime; + int faulted; + unsigned long return_hooker = (unsigned long) + &return_to_handler; + + /* Nmi's are currently unsupported */ + if (atomic_read(&in_nmi)) + return; + + /* + * Protect against fault, even if it shouldn't + * happen. This tool is too much intrusive to + * ignore such a protection. 
+ */ + asm volatile( + "1: movl (%[parent_old]), %[old]\n" + "2: movl %[return_hooker], (%[parent_replaced])\n" + " movl $0, %[faulted]\n" + + ".section .fixup, \"ax\"\n" + "3: movl $1, %[faulted]\n" + ".previous\n" + + ".section __ex_table, \"a\"\n" + " .long 1b, 3b\n" + " .long 2b, 3b\n" + ".previous\n" + + : [parent_replaced] "=rm" (parent), [old] "=r" (old), + [faulted] "=r" (faulted) + : [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) + : "memory" + ); + + if (WARN_ON(faulted)) { + unregister_ftrace_return(); + return; + } + + if (WARN_ON(!__kernel_text_address(old))) { + unregister_ftrace_return(); + *parent = old; + return; + } + + calltime = cpu_clock(raw_smp_processor_id()); + + if (push_return_trace(old, calltime, self_addr) == -EBUSY) + *parent = old; +} + +static int __init init_ftrace_function_return(void) +{ + ret_stack_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + return 0; +} +device_initcall(init_ftrace_function_return); + + +#endif + +#ifdef CONFIG_DYNAMIC_FTRACE union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; @@ -31,17 +194,11 @@ union ftrace_code_union { } __attribute__((packed)); }; - static int ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); } -unsigned char *ftrace_nop_replace(void) -{ - return ftrace_nop; -} - unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) { static union ftrace_code_union calc; @@ -183,6 +340,15 @@ do_ftrace_mod_code(unsigned long ip, void *new_code) } + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +unsigned char *ftrace_nop_replace(void) +{ + return ftrace_nop; +} + int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) @@ -292,3 +458,4 @@ int __init ftrace_dyn_arch_init(void *data) return 0; } +#endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1f5608c1102..dcbbf72a88b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -267,6 +267,26 @@ ftrace_init_module(unsigned long *start, unsigned long *end) { } #endif +/* + * Structure that defines a return function trace. + */ +struct ftrace_retfunc { + unsigned long ret; /* Return address */ + unsigned long func; /* Current function */ + unsigned long long calltime; + unsigned long long rettime; +}; + +#ifdef CONFIG_FUNCTION_RET_TRACER +/* Type of a callback handler of tracing return function */ +typedef void (*trace_function_return_t)(struct ftrace_retfunc *); + +extern void register_ftrace_return(trace_function_return_t func); +/* The current handler in use */ +extern trace_function_return_t ftrace_function_return; +extern void unregister_ftrace_return(void); +#endif + /* * Structure which defines the trace of an initcall. 
* You don't have to fill the func field since it is diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index b1299d6729f..0b4df55d7a7 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#ifdef CONFIG_DYNAMIC_FTRACE +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 295b7c756ca..df77abe860c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2005,6 +2005,17 @@ static inline void setup_thread_stack(struct task_struct *p, struct task_struct { *task_thread_info(p) = *task_thread_info(org); task_thread_info(p)->task = p; + +#ifdef CONFIG_FUNCTION_RET_TRACER + /* + * When fork() creates a child process, this function is called. + * But the child task may not inherit the return adresses traced + * by the return function tracer because it will directly execute + * in userspace and will not return to kernel functions its parent + * used. + */ + task_thread_info(p)->curr_ret_stack = -1; +#endif } static inline unsigned long *end_of_stack(struct task_struct *p) diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d8..af3be57acbb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,10 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -mno-spe -pg endif +ifdef CONFIG_FUNCTION_RET_TRACER +CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() +CFLAGS_REMOVE_module.o = -pg # For __module_text_address() +endif obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -- cgit v1.2.3-70-g09d2 From f1c4be5edad3756212cbbbeab39428fe90c27109 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Nov 2008 10:22:36 +0100 Subject: tracing, x86: clean up FUNCTION_RET_TRACER Kconfig Impact: cleanup move FUNCTION_RET_TRACER to the X86 select section, where we have all the other options. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ca91e50bdb1..0de793cf214 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -11,7 +11,6 @@ config 64BIT config X86_32 def_bool !64BIT - select HAVE_FUNCTION_RET_TRACER config X86_64 def_bool 64BIT @@ -30,6 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER + select HAVE_FUNCTION_RET_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER -- cgit v1.2.3-70-g09d2 From 6cd10f8db385ba547811baa5b26f672fdff232e6 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 9 Nov 2008 11:53:14 -0600 Subject: x86, voyager: fix smp generic helper voyager breakage Impact: build/boot fix for x86/Voyager This change: | commit 3d4422332711ef48ef0f132f1fcbfcbd56c7f3d1 | Author: Jens Axboe | Date: Thu Jun 26 11:21:34 2008 +0200 | | Add generic helpers for arch IPI function calls didn't wire up the voyager smp call function correctly, so do that here. Also make CONFIG_USE_GENERIC_SMP_HELPERS a def_bool y again, since we now use the generic helpers for every x86 architecture. 
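The fix follows the kernel's usual ops-table pattern: generic code only ever calls through struct smp_ops function pointers, and the platform installs its own IPI senders instead of the native_* defaults. A minimal user-space sketch of that pattern (the names mirror the patch, but the bodies here are illustrative stubs, not the real Voyager CPI code):

#include <stdio.h>

/* Simplified model of the smp_ops dispatch table. */
struct smp_ops {
    void (*send_call_func_ipi)(unsigned int mask);
    void (*send_call_func_single_ipi)(int cpu);
};

/* Platform-specific senders, analogous to voyager_send_call_func*(). */
static void voyager_send_call_func(unsigned int mask)
{
    printf("CPI to cpu mask 0x%x\n", mask);
}

static void voyager_send_call_func_single(int cpu)
{
    printf("CPI to cpu %d\n", cpu);
}

/* Wiring the platform hooks into the table is the whole fix: the
 * generic smp_call_function() path never names the platform. */
static struct smp_ops smp_ops = {
    .send_call_func_ipi        = voyager_send_call_func,
    .send_call_func_single_ipi = voyager_send_call_func_single,
};

int main(void)
{
    smp_ops.send_call_func_ipi(0x0e);       /* CPUs 1-3, not CPU 0 */
    smp_ops.send_call_func_single_ipi(2);
    return 0;
}

The actual hooks, including the mask manipulation against the calling CPU, appear in the voyager_smp.c diff below.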
Signed-off-by: James Bottomley Cc: Jens Axboe Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 ++++- arch/x86/mach-voyager/voyager_smp.c | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4cf0ab13d18..ac22bb7719f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -167,9 +167,12 @@ config GENERIC_PENDING_IRQ config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) - select USE_GENERIC_SMP_HELPERS default y +config USE_GENERIC_SMP_HELPERS + def_bool y + depends on SMP + config X86_32_SMP def_bool y depends on X86_32 && SMP diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 0e331652681..52145007bd7 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -7,6 +7,7 @@ * This file provides all the same external entries as smp.c but uses * the voyager hal to provide the functionality */ +#include #include #include #include @@ -1790,6 +1791,17 @@ void __init smp_setup_processor_id(void) x86_write_percpu(cpu_number, hard_smp_processor_id()); } +static void voyager_send_call_func(cpumask_t callmask) +{ + __u32 mask = cpus_addr(callmask)[0] & ~(1 << smp_processor_id()); + send_CPI(mask, VIC_CALL_FUNCTION_CPI); +} + +static void voyager_send_call_func_single(int cpu) +{ + send_CPI(1 << cpu, VIC_CALL_FUNCTION_SINGLE_CPI); +} + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = voyager_smp_prepare_boot_cpu, .smp_prepare_cpus = voyager_smp_prepare_cpus, @@ -1799,6 +1811,6 @@ struct smp_ops smp_ops = { .smp_send_stop = voyager_smp_send_stop, .smp_send_reschedule = voyager_smp_send_reschedule, - .send_call_func_ipi = native_send_call_func_ipi, - .send_call_func_single_ipi = native_send_call_func_single_ipi, + .send_call_func_ipi = voyager_send_call_func, + .send_call_func_single_ipi = voyager_send_call_func_single, }; -- cgit v1.2.3-70-g09d2 From c280ea5e4c6ba0b38ed6b005150fe16a660e903b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 8 Nov 2008 13:29:45 +0100 Subject: x86: fix documentation typo in arch/x86/Kconfig Impact: documentation update Chris Snook pointed out that it's Core i7, not Core 7i. Reported-by: Chris Snook Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 38ae04bf651..bacac556b18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -963,7 +963,7 @@ config NUMA local memory controller of the CPU and add some more NUMA awareness to the kernel. - For 64-bit this is recommended if the system is Intel Core 7i + For 64-bit this is recommended if the system is Intel Core i7 (or later), AMD Opteron, or EM64T NUMA. For 32-bit this is only needed on (rare) 32-bit-only platforms -- cgit v1.2.3-70-g09d2 From 604d20554883cf03f888440d58ea7c6d36899839 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Nov 2008 23:26:14 +0100 Subject: x86: make NUMA on 32-bit depend on EXPERIMENTAL again My previous patch to make CONFIG_NUMA on x86_32 depend on BROKEN turned out to be unnecessary, after all, since the source of the hibernation vs CONFIG_NUMA problem turned out to be the fact that we didn't take the NUMA KVA remapping into account in the hibernation code. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b56918..4cf0ab13d18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -957,7 +957,7 @@ config ARCH_PHYS_ADDR_T_64BIT config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" depends on SMP - depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && BROKEN) + depends on X86_64 || (X86_32 && HIGHMEM64G && (X86_NUMAQ || X86_BIGSMP || X86_SUMMIT && ACPI) && EXPERIMENTAL) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP) help -- cgit v1.2.3-70-g09d2 From a1afd01c175324656d0e8f1c82ea94b474953c04 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 18 Nov 2008 12:44:21 +0100 Subject: x86: default to SWIOTLB=y on x86_64 Impact: fixes korg bugzilla 11980 A kernel for a 64-bit x86 system should always contain the swiotlb code, in case it is booted on a machine with more than 4GB of RAM and without any hardware IOMMU supported by the kernel. This patch changes Kconfig to always compile swiotlb into the kernel for x86_64. Signed-off-by: Joerg Roedel Cc: stable@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b56918..669c6d588bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -566,7 +566,7 @@ config AMD_IOMMU # need this always selected by IOMMU for the VIA workaround config SWIOTLB - bool + def_bool y if X86_64 help Support for software bounce buffers used on x86-64 systems which don't have a hardware IOMMU (e.g. the current generation -- cgit v1.2.3-70-g09d2 From b5fe363b7d89577fcfda9b6cf0efc32760bbccc6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 18 Nov 2008 08:14:14 -0800 Subject: x86: use update_genapic to get rid of ES7000_CLUSTERED_APIC v2 Impact: clean up We can autodetect those systems that need the clustered APIC and update genapic accordingly.
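The mechanism is a runtime probe-and-override: detect the affected parts once, then swap the APIC ops to their clustered variants. A hedged, user-space sketch of the idea (the helper names here are invented for illustration; the real hooks live in struct genapic, and the real detection is in es7000_update_genapic() below):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for the genapic ops table. */
struct apic_ops {
    const char *(*target_cpus)(void);
};

static const char *target_cpus_flat(void)    { return "self only"; }
static const char *target_cpus_cluster(void) { return "all cpus"; }

static struct apic_ops ops = { .target_cpus = target_cpus_flat };

/* Simplified stand-in for the Pentium III family/model check. */
static bool needs_cluster_mode(int family, int model)
{
    return family == 6 && model >= 7 && model <= 11;
}

int main(void)
{
    if (needs_cluster_mode(6, 8))   /* pretend cpuid reported 6/8 */
        ops.target_cpus = target_cpus_cluster;

    printf("target_cpus: %s\n", ops.target_cpus());
    return 0;
}
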
We can also remove wakeup.h for e7000, because it's default one is now the same as overall default mach_wakecpu.h Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 -- arch/x86/include/asm/es7000/apic.h | 76 ++++++++++++++++++++++++++------------ arch/x86/include/asm/genapic_32.h | 1 + arch/x86/kernel/es7000_32.c | 17 +++++++-- arch/x86/mach-generic/es7000.c | 14 ++++++- 5 files changed, 80 insertions(+), 32 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 93224b56918..7d0ab8942cf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -462,10 +462,6 @@ config X86_CYCLONE_TIMER def_bool y depends on X86_GENERICARCH -config ES7000_CLUSTERED_APIC - def_bool y - depends on SMP && X86_ES7000 && MPENTIUMIII - source "arch/x86/Kconfig.cpu" config HPET_TIMER diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 9d8cf776c28..e24ef876915 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,28 +9,27 @@ static inline int apic_id_registered(void) return (1); } -static inline cpumask_t target_cpus(void) +static inline cpumask_t target_cpus_cluster(void) { -#if defined CONFIG_ES7000_CLUSTERED_APIC return CPU_MASK_ALL; -#else +} + +static inline cpumask_t target_cpus(void) +{ return cpumask_of_cpu(smp_processor_id()); -#endif } -#if defined CONFIG_ES7000_CLUSTERED_APIC -#define APIC_DFR_VALUE (APIC_DFR_CLUSTER) -#define INT_DELIVERY_MODE (dest_LowestPrio) -#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */ -#define NO_BALANCE_IRQ (1) -#else +#define APIC_DFR_VALUE_CLUSTER (APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE_CLUSTER (dest_LowestPrio) +#define INT_DEST_MODE_CLUSTER (1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ_CLUSTER (1) + #define APIC_DFR_VALUE (APIC_DFR_FLAT) #define INT_DELIVERY_MODE (dest_Fixed) #define INT_DEST_MODE (0) /* phys delivery to target procs */ #define NO_BALANCE_IRQ (0) #undef APIC_DEST_LOGICAL #define APIC_DEST_LOGICAL 0x0 -#endif static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) { @@ -57,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ +static inline void init_apic_ldr_cluster(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + static inline void init_apic_ldr(void) { unsigned long val; @@ -67,10 +76,6 @@ static inline void init_apic_ldr(void) apic_write(APIC_LDR, val); } -#ifndef CONFIG_X86_GENERICARCH -extern void enable_apic_mode(void); -#endif - extern int apic_version [MAX_APICS]; static inline void setup_apic_routing(void) { @@ -141,7 +146,7 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid) return (1); } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask) { int num_bits_set; int cpus_found = 0; @@ -151,11 +156,7 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) num_bits_set = cpus_weight(cpumask); /* Return id to all */ if (num_bits_set == NR_CPUS) -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else - return cpu_to_logical_apicid(0); -#endif /* * The cpus in the mask must all be on the apic cluster. 
If are not * on the same apicid cluster return default value of TARGET_CPUS. @@ -168,11 +169,40 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) if (apicid_cluster(apicid) != apicid_cluster(new_apicid)){ printk ("%s: Not a valid mask!\n", __func__); -#if defined CONFIG_ES7000_CLUSTERED_APIC return 0xFF; -#else + } + apicid = new_apicid; + cpus_found++; + } + cpu++; + } + return apicid; +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int num_bits_set; + int cpus_found = 0; + int cpu; + int apicid; + + num_bits_set = cpus_weight(cpumask); + /* Return id to all */ + if (num_bits_set == NR_CPUS) + return cpu_to_logical_apicid(0); + /* + * The cpus in the mask must all be on the apic cluster. If are not + * on the same apicid cluster return default value of TARGET_CPUS. + */ + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + while (cpus_found < num_bits_set) { + if (cpu_isset(cpu, cpumask)) { + int new_apicid = cpu_to_logical_apicid(cpu); + if (apicid_cluster(apicid) != + apicid_cluster(new_apicid)){ + printk ("%s: Not a valid mask!\n", __func__); return cpu_to_logical_apicid(0); -#endif } apicid = new_apicid; cpus_found++; diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 455d6c27a98..0ac17d33a8c 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -131,6 +131,7 @@ struct genapic { } extern struct genapic *genapic; +extern void es7000_update_genapic_to_cluster(void); enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define get_uv_system_type() UV_NONE diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index fb3bfe66fbe..71d7be624d4 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -163,7 +164,6 @@ es7000_rename_gsi(int ioapic, int gsi) return gsi; } -#ifdef CONFIG_ES7000_CLUSTERED_APIC static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { unsigned long vect = 0, psaival = 0; @@ -182,13 +182,24 @@ static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) return 0; } +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + static int __init es7000_update_genapic(void) { genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + /* MPENTIUMIII */ + if (boot_cpu_data.x86 == 6 && + (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { + es7000_update_genapic_to_cluster(); + genapic->wait_for_init_deassert = noop_wait_for_deassert; + genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + } + return 0; } -#endif void __init setup_unisys(void) @@ -206,9 +217,7 @@ setup_unisys(void) es7000_plat = ES7000_CLASSIC; ioapic_renumber_irq = es7000_rename_gsi; -#ifdef CONFIG_ES7000_CLUSTERED_APIC x86_quirks->update_genapic = es7000_update_genapic; -#endif } /* diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 28459cab3dd..7b4e6d0d169 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -16,7 +16,19 @@ #include #include #include -#include +#include + +void __init es7000_update_genapic_to_cluster(void) +{ + genapic->target_cpus = target_cpus_cluster; + genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; + genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; + genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; + + genapic->init_apic_ldr = init_apic_ldr_cluster; + + genapic->cpu_mask_to_apicid = 
cpu_mask_to_apicid_cluster; +} static int probe_es7000(void) { -- cgit v1.2.3-70-g09d2 From 8d26487fd4ddda7a0237da418fb8669fb06ae557 Mon Sep 17 00:00:00 2001 From: Török Edwin Date: Sun, 23 Nov 2008 12:39:08 +0200 Subject: tracing/stack-tracer: introduce CONFIG_USER_STACKTRACE_SUPPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup User stack tracing is just implemented for x86, but it is not x86 specific. Introduce a generic config flag, that is currently enabled only for x86. When other arches implement it, they will have to SELECT USER_STACKTRACE_SUPPORT. Signed-off-by: Török Edwin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + include/linux/stacktrace.h | 2 +- kernel/trace/Kconfig | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a146baaa99..e49a4fd718f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -36,6 +36,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select USER_STACKTRACE_SUPPORT config ARCH_DEFCONFIG string diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index fd42d685110..1a8cecc4f38 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -16,7 +16,7 @@ extern void save_stack_trace_tsk(struct task_struct *tsk, extern void print_stack_trace(struct stack_trace *trace, int spaces); -#ifdef CONFIG_X86 +#ifdef CONFIG_USER_STACKTRACE_SUPPORT extern void save_stack_trace_user(struct stack_trace *trace); #else # define save_stack_trace_user(trace) do { } while (0) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b8378fad29a..87fc34a1bb9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -3,6 +3,9 @@ # select HAVE_FUNCTION_TRACER: # +config USER_STACKTRACE_SUPPORT + bool + config NOP_TRACER bool -- cgit v1.2.3-70-g09d2 From e45f2c07742d447597df001c878bc4a8aafcde37 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Mon, 24 Nov 2008 11:28:36 +0300 Subject: x86: correct link to HPET timer specification Impact: update documentation / help text Original link is dead. Signed-off-by: Denis V. Lunev Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- drivers/char/hpet.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..19f0d97829e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -482,7 +482,7 @@ config HPET_TIMER The HPET provides a stable time base on SMP systems, unlike the TSC, but it is more expensive to access, as it is off-chip. You can find the HPET spec at - . + . You can safely choose Y here. However, HPET will only be activated if the platform and the BIOS support this feature. diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 53fdc7ff387..32b8bbf5003 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -46,7 +46,7 @@ /* * The High Precision Event Timer driver. * This driver is closely modelled after the rtc.c driver. 
- * http://www.intel.com/hardwaredesign/hpetspec.htm + * http://www.intel.com/hardwaredesign/hpetspec_1.pdf */ #define HPET_USER_FREQ (64) #define HPET_DRIFT (500) -- cgit v1.2.3-70-g09d2 From fb52607afcd0629776f1dc9e657647ceae81dd50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 25 Nov 2008 21:07:04 +0100 Subject: tracing/function-return-tracer: change the name into function-graph-tracer Impact: cleanup This patch changes the name of the "return function tracer" into function-graph-tracer which is a more suitable name for a tracing which makes one able to retrieve the ordered call stack during the code flow. Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/ftrace.h | 4 +- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/entry_32.S | 12 ++--- arch/x86/kernel/ftrace.c | 12 ++--- include/linux/ftrace.h | 24 ++++----- include/linux/ftrace_irq.h | 2 +- include/linux/sched.h | 2 +- kernel/Makefile | 2 +- kernel/fork.c | 4 +- kernel/sched.c | 2 +- kernel/trace/Kconfig | 19 ++++--- kernel/trace/Makefile | 2 +- kernel/trace/ftrace.c | 26 +++++----- kernel/trace/trace.c | 18 +++---- kernel/trace/trace.h | 12 ++--- kernel/trace/trace_functions_graph.c | 98 ++++++++++++++++++++++++++++++++++++ 17 files changed, 173 insertions(+), 72 deletions(-) create mode 100644 kernel/trace/trace_functions_graph.c (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e49a4fd718f..0842b112768 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_RET_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER if X86_32 select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 754a3e082f9..7e61b4ceb9a 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -28,7 +28,7 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef __ASSEMBLY__ @@ -51,6 +51,6 @@ struct ftrace_ret_stack { extern void return_to_handler(void); #endif /* __ASSEMBLY__ */ -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index af2bc36ca1c..64939a0c398 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -14,7 +14,7 @@ CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg endif @@ -70,7 +70,7 @@ obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 74defe21ba4..2b1f0f081a6 100644 --- 
a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1188,9 +1188,9 @@ ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace -#ifdef CONFIG_FUNCTION_RET_TRACER - cmpl $ftrace_stub, ftrace_function_return - jnz ftrace_return_caller +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_function + jnz ftrace_graph_caller #endif .globl ftrace_stub ftrace_stub: @@ -1215,8 +1215,8 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ -#ifdef CONFIG_FUNCTION_RET_TRACER -ENTRY(ftrace_return_caller) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) cmpl $0, function_trace_stop jne ftrace_stub @@ -1230,7 +1230,7 @@ ENTRY(ftrace_return_caller) popl %ecx popl %eax ret -END(ftrace_return_caller) +END(ftrace_graph_caller) .globl return_to_handler return_to_handler: diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bb137f7297e..3595a4c14ab 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -323,7 +323,7 @@ int __init ftrace_dyn_arch_init(void *data) } #endif -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifndef CONFIG_DYNAMIC_FTRACE @@ -389,11 +389,11 @@ static void pop_return_trace(unsigned long *ret, unsigned long long *time, */ unsigned long ftrace_return_to_handler(void) { - struct ftrace_retfunc trace; + struct ftrace_graph_ret trace; pop_return_trace(&trace.ret, &trace.calltime, &trace.func, &trace.overrun); trace.rettime = cpu_clock(raw_smp_processor_id()); - ftrace_function_return(&trace); + ftrace_graph_function(&trace); return trace.ret; } @@ -440,12 +440,12 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ); if (WARN_ON(faulted)) { - unregister_ftrace_return(); + unregister_ftrace_graph(); return; } if (WARN_ON(!__kernel_text_address(old))) { - unregister_ftrace_return(); + unregister_ftrace_graph(); *parent = old; return; } @@ -456,4 +456,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) *parent = old; } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7854d87b97b..b4ac734ad8d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -115,8 +115,8 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func); extern void ftrace_caller(void); extern void ftrace_call(void); extern void mcount_call(void); -#ifdef CONFIG_FUNCTION_RET_TRACER -extern void ftrace_return_caller(void); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern void ftrace_graph_caller(void); #endif /** @@ -315,7 +315,7 @@ ftrace_init_module(struct module *mod, /* * Structure that defines a return function trace. 
*/ -struct ftrace_retfunc { +struct ftrace_graph_ret { unsigned long ret; /* Return address */ unsigned long func; /* Current function */ unsigned long long calltime; @@ -324,22 +324,22 @@ struct ftrace_retfunc { unsigned long overrun; }; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER #define FTRACE_RETFUNC_DEPTH 50 #define FTRACE_RETSTACK_ALLOC_SIZE 32 /* Type of a callback handler of tracing return function */ -typedef void (*trace_function_return_t)(struct ftrace_retfunc *); +typedef void (*trace_function_graph_t)(struct ftrace_graph_ret *); -extern int register_ftrace_return(trace_function_return_t func); +extern int register_ftrace_graph(trace_function_graph_t func); /* The current handler in use */ -extern trace_function_return_t ftrace_function_return; -extern void unregister_ftrace_return(void); +extern trace_function_graph_t ftrace_graph_function; +extern void unregister_ftrace_graph(void); -extern void ftrace_retfunc_init_task(struct task_struct *t); -extern void ftrace_retfunc_exit_task(struct task_struct *t); +extern void ftrace_graph_init_task(struct task_struct *t); +extern void ftrace_graph_exit_task(struct task_struct *t); #else -static inline void ftrace_retfunc_init_task(struct task_struct *t) { } -static inline void ftrace_retfunc_exit_task(struct task_struct *t) { } +static inline void ftrace_graph_init_task(struct task_struct *t) { } +static inline void ftrace_graph_exit_task(struct task_struct *t) { } #endif #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h index 0b4df55d7a7..366a054d0b0 100644 --- a/include/linux/ftrace_irq.h +++ b/include/linux/ftrace_irq.h @@ -2,7 +2,7 @@ #define _LINUX_FTRACE_IRQ_H -#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_RET_TRACER) +#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER) extern void ftrace_nmi_enter(void); extern void ftrace_nmi_exit(void); #else diff --git a/include/linux/sched.h b/include/linux/sched.h index d02a0ca70ee..7ad48f2a275 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1365,7 +1365,7 @@ struct task_struct { unsigned long default_timer_slack_ns; struct list_head *scm_work_list; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored adress in ret_stack */ int curr_ret_stack; /* Stack of return addresses for return function tracing */ diff --git a/kernel/Makefile b/kernel/Makefile index 03a45e7e87b..703cf3b7389 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -21,7 +21,7 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_sched.o = -pg endif -ifdef CONFIG_FUNCTION_RET_TRACER +ifdef CONFIG_FUNCTION_GRAPH_TRACER CFLAGS_REMOVE_extable.o = -pg # For __kernel_text_address() CFLAGS_REMOVE_module.o = -pg # For __module_text_address() endif diff --git a/kernel/fork.c b/kernel/fork.c index d6e1a3205f6..5f82a999c03 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -140,7 +140,7 @@ void free_task(struct task_struct *tsk) prop_local_destroy_single(&tsk->dirties); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); - ftrace_retfunc_exit_task(tsk); + ftrace_graph_exit_task(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -1271,7 +1271,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, total_forks++; spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); - ftrace_retfunc_init_task(p); + ftrace_graph_init_task(p); proc_fork_connector(p); cgroup_post_fork(p); return p; 
diff --git a/kernel/sched.c b/kernel/sched.c index 388d9db044a..52490bf6b88 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5901,7 +5901,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_retfunc_init_task(idle); + ftrace_graph_init_task(idle); } /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 620feadff67..eb9b901e077 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -12,7 +12,7 @@ config NOP_TRACER config HAVE_FUNCTION_TRACER bool -config HAVE_FUNCTION_RET_TRACER +config HAVE_FUNCTION_GRAPH_TRACER bool config HAVE_FUNCTION_TRACE_MCOUNT_TEST @@ -63,15 +63,18 @@ config FUNCTION_TRACER (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. -config FUNCTION_RET_TRACER - bool "Kernel Function return Tracer" - depends on HAVE_FUNCTION_RET_TRACER +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER depends on FUNCTION_TRACER help - Enable the kernel to trace a function at its return. - It's first purpose is to trace the duration of functions. - This is done by setting the current return address on the thread - info structure of the current task. + Enable the kernel to trace a function at both its return + and its entry. + It's first purpose is to trace the duration of functions and + draw a call graph for each thread with some informations like + the return value. + This is done by setting the current return address on the current + task structure into a stack of calls. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index cef4bcb4e82..08c5fe6ddc0 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -29,7 +29,7 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o obj-$(CONFIG_STACK_TRACER) += trace_stack.o obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o obj-$(CONFIG_BOOT_TRACER) += trace_boot.o -obj-$(CONFIG_FUNCTION_RET_TRACER) += trace_functions_return.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_BTS_TRACER) += trace_bts.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 53042f118f2..9e19976af72 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -395,11 +395,11 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) unsigned long ip, fl; unsigned long ftrace_addr; -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER if (ftrace_tracing_type == FTRACE_TYPE_ENTER) ftrace_addr = (unsigned long)ftrace_caller; else - ftrace_addr = (unsigned long)ftrace_return_caller; + ftrace_addr = (unsigned long)ftrace_graph_caller; #else ftrace_addr = (unsigned long)ftrace_caller; #endif @@ -1496,13 +1496,13 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, return ret; } -#ifdef CONFIG_FUNCTION_RET_TRACER +#ifdef CONFIG_FUNCTION_GRAPH_TRACER static atomic_t ftrace_retfunc_active; /* The callback that hooks the return of a function */ -trace_function_return_t ftrace_function_return = - (trace_function_return_t)ftrace_stub; +trace_function_graph_t ftrace_graph_function = + (trace_function_graph_t)ftrace_stub; /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. 
*/ @@ -1549,7 +1549,7 @@ free: } /* Allocate a return stack for each task */ -static int start_return_tracing(void) +static int start_graph_tracing(void) { struct ftrace_ret_stack **ret_stack_list; int ret; @@ -1569,7 +1569,7 @@ static int start_return_tracing(void) return ret; } -int register_ftrace_return(trace_function_return_t func) +int register_ftrace_graph(trace_function_graph_t func) { int ret = 0; @@ -1584,13 +1584,13 @@ int register_ftrace_return(trace_function_return_t func) goto out; } atomic_inc(&ftrace_retfunc_active); - ret = start_return_tracing(); + ret = start_graph_tracing(); if (ret) { atomic_dec(&ftrace_retfunc_active); goto out; } ftrace_tracing_type = FTRACE_TYPE_RETURN; - ftrace_function_return = func; + ftrace_graph_function = func; ftrace_startup(); out: @@ -1598,12 +1598,12 @@ out: return ret; } -void unregister_ftrace_return(void) +void unregister_ftrace_graph(void) { mutex_lock(&ftrace_sysctl_lock); atomic_dec(&ftrace_retfunc_active); - ftrace_function_return = (trace_function_return_t)ftrace_stub; + ftrace_graph_function = (trace_function_graph_t)ftrace_stub; ftrace_shutdown(); /* Restore normal tracing type */ ftrace_tracing_type = FTRACE_TYPE_ENTER; @@ -1612,7 +1612,7 @@ void unregister_ftrace_return(void) } /* Allocate a return stack for newly created task */ -void ftrace_retfunc_init_task(struct task_struct *t) +void ftrace_graph_init_task(struct task_struct *t) { if (atomic_read(&ftrace_retfunc_active)) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH @@ -1626,7 +1626,7 @@ void ftrace_retfunc_init_task(struct task_struct *t) t->ret_stack = NULL; } -void ftrace_retfunc_exit_task(struct task_struct *t) +void ftrace_graph_exit_task(struct task_struct *t) { struct ftrace_ret_stack *ret_stack = t->ret_stack; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8df8fdd69c9..f21ab2c68fd 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -878,15 +878,15 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data, ring_buffer_unlock_commit(tr->buffer, event, irq_flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -static void __trace_function_return(struct trace_array *tr, +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void __trace_function_graph(struct trace_array *tr, struct trace_array_cpu *data, - struct ftrace_retfunc *trace, + struct ftrace_graph_ret *trace, unsigned long flags, int pc) { struct ring_buffer_event *event; - struct ftrace_ret_entry *entry; + struct ftrace_graph_entry *entry; unsigned long irq_flags; if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) @@ -1177,8 +1177,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } -#ifdef CONFIG_FUNCTION_RET_TRACER -void trace_function_return(struct ftrace_retfunc *trace) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +void trace_function_graph(struct ftrace_graph_ret *trace) { struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1193,12 +1193,12 @@ void trace_function_return(struct ftrace_retfunc *trace) disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); - __trace_function_return(tr, data, trace, flags, pc); + __trace_function_graph(tr, data, trace, flags, pc); } atomic_dec(&data->disabled); raw_local_irq_restore(flags); } -#endif /* CONFIG_FUNCTION_RET_TRACER */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ static struct ftrace_ops trace_ops __read_mostly = { @@ -2001,7 +2001,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) break; } case 
TRACE_FN_RET: { - return print_return_function(iter); + return print_graph_function(iter); break; } case TRACE_BRANCH: { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3abd645e8af..72b5ef86876 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -57,7 +57,7 @@ struct ftrace_entry { }; /* Function return entry */ -struct ftrace_ret_entry { +struct ftrace_graph_entry { struct trace_entry ent; unsigned long ip; unsigned long parent_ip; @@ -264,7 +264,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct trace_boot_call, TRACE_BOOT_CALL);\ IF_ASSIGN(var, ent, struct trace_boot_ret, TRACE_BOOT_RET);\ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ - IF_ASSIGN(var, ent, struct ftrace_ret_entry, TRACE_FN_RET);\ + IF_ASSIGN(var, ent, struct ftrace_graph_entry, TRACE_FN_RET);\ IF_ASSIGN(var, ent, struct bts_entry, TRACE_BTS);\ __ftrace_bad_type(); \ } while (0) @@ -398,7 +398,7 @@ void trace_function(struct trace_array *tr, unsigned long parent_ip, unsigned long flags, int pc); void -trace_function_return(struct ftrace_retfunc *trace); +trace_function_graph(struct ftrace_graph_ret *trace); void trace_bts(struct trace_array *tr, unsigned long from, @@ -489,11 +489,11 @@ extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ -#ifdef CONFIG_FUNCTION_RET_TRACER -extern enum print_line_t print_return_function(struct trace_iterator *iter); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +extern enum print_line_t print_graph_function(struct trace_iterator *iter); #else static inline enum print_line_t -print_return_function(struct trace_iterator *iter) +print_graph_function(struct trace_iterator *iter) { return TRACE_TYPE_UNHANDLED; } diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 00000000000..f5bad4624d2 --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,98 @@ +/* + * + * Function graph tracer. 
+ * Copyright (c) 2008 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include + +#include "trace.h" + + +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +static struct tracer_opt trace_opts[] = { + /* Display overruns or not */ + { TRACER_OPT(overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, /* Don't display overruns by default */ + .opts = trace_opts +}; + + +static int graph_trace_init(struct trace_array *tr) +{ + int cpu; + for_each_online_cpu(cpu) + tracing_reset(tr, cpu); + + return register_ftrace_graph(&trace_function_graph); +} + +static void graph_trace_reset(struct trace_array *tr) +{ + unregister_ftrace_graph(); +} + + +enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_graph_entry *field; + int ret; + + if (entry->type == TRACE_FN_RET) { + trace_assign_type(field, entry); + ret = trace_seq_printf(s, "%pF -> ", (void *)field->parent_ip); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = seq_print_ip_sym(s, field->ip, + trace_flags & TRACE_ITER_SYM_MASK); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " (%llu ns)", + field->rettime - field->calltime); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)", + field->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; + } + return TRACE_TYPE_UNHANDLED; +} + +static struct tracer graph_trace __read_mostly = { + .name = "function-graph", + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .flags = &tracer_flags, +}; + +static __init int init_graph_trace(void) +{ + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); -- cgit v1.2.3-70-g09d2 From 2c5643b1c5c7fbb13f340d4c58944d9642f41796 Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Sun, 30 Nov 2008 17:16:04 +0900 Subject: x86: provide readq()/writeq() on 32-bit too Impact: add new API for drivers Add implementation of readq/writeq to x86_32, and add config value to the x86 architecture to determine existence of readq/writeq. 
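The 32-bit fallback simply composes one 64-bit MMIO access out of two 32-bit ones, which is why such a split access is not atomic with respect to the device. A small stand-alone sketch of the same technique, using plain memory in place of an __iomem region (the real helpers follow in the io.h diff below):

#include <stdint.h>
#include <stdio.h>

/* Low dword first, then high, matching the patch's readq. */
static uint64_t readq_split(const volatile uint32_t *p)
{
    uint32_t l = p[0];
    uint32_t h = p[1];
    return ((uint64_t)h << 32) | l;
}

static void writeq_split(uint64_t val, volatile uint32_t *p)
{
    p[0] = (uint32_t)val;
    p[1] = (uint32_t)(val >> 32);
}

int main(void)
{
    uint32_t reg[2];

    writeq_split(0x1122334455667788ULL, reg);
    /* Prints 0x1122334455667788: the two halves recombine. */
    printf("0x%llx\n", (unsigned long long)readq_split(reg));
    return 0;
}
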
Signed-off-by: Hitoshi Mitake Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 ++ arch/x86/include/asm/io.h | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..a7d50f5d118 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -19,6 +19,8 @@ config X86_64 config X86 def_bool y select HAVE_AOUT if X86_32 + select HAVE_READQ + select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ac2abc88cd9..25946449df4 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -4,6 +4,7 @@ #define ARCH_HAS_IOREMAP_WC #include +#include #define build_mmio_read(name, size, type, reg, barrier) \ static inline type name(const volatile void __iomem *addr) \ @@ -57,6 +58,29 @@ build_mmio_write(__writeq, "q", unsigned long, "r", ) /* Let people know we have them */ #define readq readq #define writeq writeq + +#else /* CONFIG_X86_32 from here */ + +static inline __u64 readq(const volatile void __iomem *addr) +{ + const volatile u32 __iomem *p = addr; + u32 l, h; + + l = readl(p); + h = readl(p + 1); + + return l + ((u64)h << 32); +} + +static inline void writeq(__u64 val, volatile void __iomem *addr) +{ + writel(val, addr); + writel(val >> 32, addr+4); +} + +#define readq readq +#define writeq writeq + #endif extern int iommu_bio_merge; -- cgit v1.2.3-70-g09d2 From 48d68b20d00865035b8b65e69af343d0f53fac9d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Dec 2008 00:20:39 +0100 Subject: tracing/function-graph-tracer: support for x86-64 Impact: extend and enable the function graph tracer to 64-bit x86 This patch implements support for the function graph tracer under x86-64. Both static and dynamic tracing are supported. This adds some small CPP conditionals around the asm in arch/x86/kernel/ftrace.c. I wanted to use probe_kernel_read/write to make the return address saving/patching code more generic, but it causes tracing recursion. It would perhaps be useful to implement a notrace version of these functions for other arch ports. Note that arch/x86/process_64.c is not traced, as in X86-32. I first thought __switch_to() was responsible for crashes during tracing because I believed the current task was changed inside it, but that is actually not the case (the task does change, but not the "current" pointer). So I will have to investigate to find the functions that cause harm here, in order to enable tracing of the other functions inside (but there is no issue for now, while process_64.c stays out of the -pg flags). A small possible race condition is fixed inside this patch too. When the tracer allocates a return stack dynamically, the current depth was initialized only after the allocation, not before. An interrupt could occur at this time and, after seeing that the return stack is allocated, the tracer could try to trace it with a random, uninitialized depth. This is a precaution, even though I never actually hit the problem.
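The race fix follows the classic initialize-then-publish pattern: fully set up the per-task state, force those stores to be visible, and only then expose the pointer that interrupt context tests for. A simplified stand-alone sketch (field names are modeled on the patch, not taken verbatim; the real hunk is in kernel/trace/ftrace.c below, which uses barrier() the same way):

#include <stddef.h>

/* Compiler barrier, standing in for the kernel's barrier(). */
#define barrier() __asm__ __volatile__("" ::: "memory")

struct task_state {
    int depth;          /* curr_ret_stack in the real code */
    void *ret_stack;    /* NULL means "not traced yet" */
};

static void publish_ret_stack(struct task_state *t, void *stack)
{
    t->depth = -1;      /* init BEFORE publication... */
    barrier();          /* ...and keep the compiler from reordering */
    t->ret_stack = stack;   /* an IRQ that now sees a non-NULL stack
                             * also sees a valid depth */
}

int main(void)
{
    static long stack[64];
    struct task_state t = { 0, NULL };

    publish_ret_stack(&t, stack);
    return 0;
}
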
Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tim Bird Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/entry_64.S | 74 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/ftrace.c | 11 ++++++- kernel/trace/ftrace.c | 4 ++- 5 files changed, 89 insertions(+), 3 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0842b112768..45c86fb9413 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,7 +29,7 @@ config X86 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER - select HAVE_FUNCTION_GRAPH_TRACER if X86_32 + select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 64939a0c398..d274425fb07 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -17,6 +17,7 @@ endif ifdef CONFIG_FUNCTION_GRAPH_TRACER # Don't trace __switch_to() but let it for function tracer CFLAGS_REMOVE_process_32.o = -pg +CFLAGS_REMOVE_process_64.o = -pg endif # diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 08aa6b10933..2aa0526ac30 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -98,6 +98,12 @@ ftrace_call: movq (%rsp), %rax addq $0x38, %rsp +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + .globl ftrace_stub ftrace_stub: retq @@ -110,6 +116,12 @@ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller +#endif + .globl ftrace_stub ftrace_stub: retq @@ -145,6 +157,68 @@ END(mcount) #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + cmpl $0, function_trace_stop + jne ftrace_stub + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + leaq 8(%rbp), %rdi + movq 0x38(%rsp), %rsi + + call prepare_ftrace_return + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: + subq $80, %rsp + + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + movq %r10, 56(%rsp) + movq %r11, 64(%rsp) + + call ftrace_return_to_handler + + movq %rax, 72(%rsp) + movq 64(%rsp), %r11 + movq 56(%rsp), %r10 + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $72, %rsp + retq +#endif + + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7ef914e6a2f..58832478b94 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -467,8 +467,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) * ignore such a protection. 
*/ asm volatile( +#ifdef CONFIG_X86_64 + "1: movq (%[parent_old]), %[old]\n" + "2: movq %[return_hooker], (%[parent_replaced])\n" +#else "1: movl (%[parent_old]), %[old]\n" "2: movl %[return_hooker], (%[parent_replaced])\n" +#endif " movl $0, %[faulted]\n" ".section .fixup, \"ax\"\n" @@ -476,8 +481,13 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ".previous\n" ".section __ex_table, \"a\"\n" +#ifdef CONFIG_X86_64 + " .quad 1b, 3b\n" + " .quad 2b, 3b\n" +#else " .long 1b, 3b\n" " .long 2b, 3b\n" +#endif ".previous\n" : [parent_replaced] "=r" (parent), [old] "=r" (old), @@ -509,5 +519,4 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) ftrace_graph_entry(&trace); } - #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 08b536a2614..f7249962752 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1673,8 +1673,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - t->ret_stack = ret_stack_list[start++]; t->curr_ret_stack = -1; + /* Make sure IRQs see the -1 first: */ + barrier(); + t->ret_stack = ret_stack_list[start++]; atomic_set(&t->trace_overrun, 0); } } while_each_thread(g, t); -- cgit v1.2.3-70-g09d2 From 8daa19051e1c7369c89ace7b18e74fe1f55dfa29 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Mon, 1 Dec 2008 14:13:53 -0800 Subject: x86, apm: remove CONFIG_APM_REAL_MODE_POWER_OFF in favor of a kernel parameter Remove CONFIG_APM_REAL_MODE_POWER_OFF like CONFIG_APM_POWER_OFF which has been done for linux-2.2.14pre8 (http://lkml.org/lkml/1999/11/23/3). Re-introducing CONFIG_APM_POWER_OFF got nack-ed. Stephen didn't bother to remove CONFIG_APM_REAL_MODE_POWER_OFF, let's get rid of it now. Reference: http://lkml.org/lkml/2008/5/7/97 Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 7 ------- arch/x86/kernel/apm_32.c | 4 ---- 2 files changed, 11 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4cf0ab13d18..ebcad15ccf3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1629,13 +1629,6 @@ config APM_ALLOW_INTS many of the newer IBM Thinkpads. If you experience hangs when you suspend, try setting this to Y. Otherwise, say N. -config APM_REAL_MODE_POWER_OFF - bool "Use real mode APM BIOS call to power off" - help - Use real mode APM BIOS calls to switch off the computer. This is - a work-around for a number of buggy BIOSes. Switch this option on if - your computer crashes instead of powering off properly. - endif # APM source "arch/x86/kernel/cpu/cpufreq/Kconfig" diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5145a6e72bb..3a26525a3f3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -391,11 +391,7 @@ static int power_off; #else static int power_off = 1; #endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else static int realmode_power_off; -#endif #ifdef CONFIG_APM_ALLOW_INTS static int allow_ints = 1; #else -- cgit v1.2.3-70-g09d2 From 0b8f1efad30bd58f89961b82dfe68b9edf8fd2ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 5 Dec 2008 18:58:31 -0800 Subject: sparse irq_desc[] array: core kernel and x86 changes Impact: new feature Problem on distro kernels: irq_desc[NR_IRQS] takes megabytes of RAM with NR_CPUS set to large values. The goal is to be able to scale up to much larger NR_IRQS value without impacting the (important) common case. 
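For scale, a back-of-envelope sketch of the dense-array cost (the descriptor size and the NR_VECTORS value below are assumptions for illustration; the real struct irq_desc is configuration-dependent):

#include <stdio.h>

int main(void)
{
    unsigned long nr_irqs = 256 + 8 * 4096; /* NR_VECTORS + 8*NR_CPUS */
    unsigned long desc_sz = 256;            /* assumed bytes per desc */

    /* ~8 MB of static RAM with NR_CPUS=4096, even if only a few
     * dozen IRQs are ever used. */
    printf("dense irq_desc[] cost: %lu KB\n", nr_irqs * desc_sz / 1024);
    return 0;
}
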
To solve this, we generalize irq_desc[NR_IRQS] to an (optional) array of irq_desc pointers. When CONFIG_SPARSE_IRQ=y is used, we use kzalloc_node to get irq_desc, this also makes the IRQ descriptors NUMA-local (to the site that calls request_irq()). This gets rid of the irq_cfg[] static array on x86 as well: irq_cfg now uses desc->chip_data for x86 to store irq_cfg. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 ++ arch/x86/include/asm/irq_vectors.h | 9 ++ arch/x86/kernel/io_apic.c | 301 +++++++++++++++++++++++-------------- arch/x86/kernel/irq.c | 3 + arch/x86/kernel/irq_32.c | 2 + arch/x86/kernel/irq_64.c | 2 + arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/irqinit_64.c | 1 - drivers/char/random.c | 22 +-- drivers/pci/intr_remapping.c | 76 +++++++++- drivers/xen/events.c | 12 +- fs/proc/stat.c | 17 ++- include/linux/interrupt.h | 2 + include/linux/irq.h | 54 ++++++- include/linux/irqnr.h | 14 +- include/linux/kernel_stat.h | 14 +- include/linux/random.h | 51 +++++++ init/main.c | 11 ++ kernel/irq/autoprobe.c | 15 ++ kernel/irq/chip.c | 3 +- kernel/irq/handle.c | 181 +++++++++++++++++++++- kernel/irq/proc.c | 6 +- kernel/irq/spurious.c | 5 + 23 files changed, 649 insertions(+), 163 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..48ac688de3c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -238,6 +238,16 @@ config X86_HAS_BOOT_CPU_ID def_bool y depends on X86_VOYAGER +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + default y + help + This enables support for sparse irq, esp for msi/msi-x. You may need + if you have lots of cards supports msi-x installed. + + If you don't know what to do here, say Y. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..bb6b69a6b12 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,11 +102,20 @@ #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15) #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) + +#ifndef CONFIG_SPARSE_IRQ # if NR_CPUS < MAX_IO_APICS # define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) # else # define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) # endif +#else +# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) +# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) +# else +# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +# endif +#endif #elif defined(CONFIG_X86_VOYAGER) diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9043251210f..9de17f5c112 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -108,8 +108,33 @@ static int __init parse_noapic(char *str) early_param("noapic", parse_noapic); struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. 
+ */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ + struct irq_pin_list *pin; + int node; + + node = cpu_to_node(cpu); + + pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node); + + return pin; +} + struct irq_cfg { - unsigned int irq; struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -119,83 +144,93 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else static struct irq_cfg irq_cfgx[NR_IRQS] = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +#endif + [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; -#define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) - -static struct irq_cfg *irq_cfg(unsigned int irq) +void __init arch_early_irq_init(void) { - return irq < nr_irqs ? irq_cfgx + irq : NULL; -} + struct irq_cfg *cfg; + struct irq_desc *desc; + int count; + int i; -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - return irq_cfg(irq); -} + cfg = irq_cfgx; + count = ARRAY_SIZE(irq_cfgx); -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. 
- */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + for (i = 0; i < count; i++) { + desc = irq_to_desc(i); + desc->chip_data = &cfg[i]; + } +} -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + struct irq_cfg *cfg = NULL; + struct irq_desc *desc; -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; + desc = irq_to_desc(irq); + if (desc) + cfg = desc->chip_data; -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; + return cfg; +} -static void __init irq_2_pin_init(void) +static struct irq_cfg *get_one_free_irq_cfg(int cpu) { - struct irq_pin_list *pin = irq_2_pin_head; - int i; + struct irq_cfg *cfg; + int node; + + node = cpu_to_node(cpu); - for (i = 1; i < PIN_MAP_SIZE; i++) - pin[i-1].next = &pin[i]; + cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node); - irq_2_pin_ptr = &pin[0]; + return cfg; } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +void arch_init_chip_data(struct irq_desc *desc, int cpu) { - struct irq_pin_list *pin = irq_2_pin_ptr; + struct irq_cfg *cfg; - if (!pin) - panic("can not get more irq_2_pin\n"); + cfg = desc->chip_data; + if (!cfg) { + desc->chip_data = get_one_free_irq_cfg(cpu); + if (!desc->chip_data) { + printk(KERN_ERR "can not alloc irq_cfg\n"); + BUG_ON(1); + } + } +} - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; +#else +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + return irq < nr_irqs ? irq_cfgx + irq : NULL; } +#endif + struct io_apic { unsigned int index; unsigned int unused[3]; @@ -397,16 +432,19 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(unsigned int irq, int cpu, int apic, int pin) { - struct irq_cfg *cfg; struct irq_pin_list *entry; + struct irq_cfg *cfg = irq_cfg(irq); - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; if (!entry) { - entry = get_one_free_irq_2_pin(); + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", + apic, pin); + return; + } cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; @@ -421,7 +459,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry = entry->next; } - entry->next = get_one_free_irq_2_pin(); + entry->next = get_one_free_irq_2_pin(cpu); entry = entry->next; entry->apic = apic; entry->pin = pin; @@ -430,7 +468,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) /* * Reroute an IRQ to a different pin. */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq(unsigned int irq, int cpu, int oldapic, int oldpin, int newapic, int newpin) { @@ -451,7 +489,7 @@ static void __init replace_pin_at_irq(unsigned int irq, /* why? call replace before add? 
*/ if (!replaced) - add_pin_to_irq(irq, newapic, newpin); + add_pin_to_irq_cpu(irq, cpu, newapic, newpin); } static inline void io_apic_modify_irq(unsigned int irq, @@ -1162,9 +1200,13 @@ void __setup_vector_irq(int cpu) /* This function must be called with vector_lock held */ int irq, vector; struct irq_cfg *cfg; + struct irq_desc *desc; /* Mark the inuse vectors */ - for_each_irq_cfg(irq, cfg) { + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + cfg = desc->chip_data; if (!cpu_isset(cpu, cfg->domain)) continue; vector = cfg->vector; @@ -1356,6 +1398,8 @@ static void __init setup_IO_APIC_irqs(void) { int apic, pin, idx, irq; int notcon = 0; + struct irq_desc *desc; + int cpu = boot_cpu_id; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1387,7 +1431,12 @@ static void __init setup_IO_APIC_irqs(void) if (multi_timer_check(apic, irq)) continue; #endif - add_pin_to_irq(irq, apic, pin); + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + add_pin_to_irq_cpu(irq, cpu, apic, pin); setup_IO_APIC_irq(apic, pin, irq, irq_trigger(idx), irq_polarity(idx)); @@ -1448,6 +1497,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + struct irq_desc *desc; unsigned int irq; if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1587,13 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(irq, cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; + for_each_irq_desc(irq, desc) { + struct irq_pin_list *entry; + + if (!desc) + continue; + cfg = desc->chip_data; + entry = cfg->irq_2_pin; if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", irq); @@ -2022,6 +2077,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; + struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); if (irq < 16) { @@ -2029,6 +2085,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) if (i8259A_irq_pending(irq)) was_pending = 1; } + cfg = irq_cfg(irq); __unmask_IO_APIC_irq(irq); spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2178,6 +2235,9 @@ static void ir_irq_migration(struct work_struct *work) struct irq_desc *desc; for_each_irq_desc(irq, desc) { + if (!desc) + continue; + if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -2229,6 +2289,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; + if (irq == -1) + continue; + desc = irq_to_desc(irq); if (!desc) continue; @@ -2430,8 +2493,12 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(irq, cfg) { - if (IO_APIC_IRQ(irq) && !cfg->vector) { + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + + cfg = desc->chip_data; + if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2439,11 +2506,9 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); + else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; - } } } } @@ -2654,7 +2719,7 @@ static inline void __init check_timer(void) * Ok, does IRQ0 through the IOAPIC work? 
*/ if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); + add_pin_to_irq_cpu(0, boot_cpu_id, apic1, pin1); setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); @@ -2683,7 +2748,7 @@ static inline void __init check_timer(void) /* * legacy devices should be connected to IO APIC #0 */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + replace_pin_at_irq(0, boot_cpu_id, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); unmask_IO_APIC_irq(0); enable_8259A_irq(0); @@ -2902,21 +2967,25 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned int irq; unsigned int new; unsigned long flags; - struct irq_cfg *cfg_new; - - irq_want = nr_irqs - 1; + struct irq_cfg *cfg_new = NULL; + int cpu = boot_cpu_id; + struct irq_desc *desc_new = NULL; irq = 0; spin_lock_irqsave(&vector_lock, flags); for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) + + desc_new = irq_to_desc_alloc_cpu(new, cpu); + if (!desc_new) { + printk(KERN_INFO "can not get irq_desc for %d\n", new); + continue; + } + cfg_new = desc_new->chip_data; + + if (cfg_new->vector != 0) continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; @@ -2925,6 +2994,9 @@ unsigned int create_irq_nr(unsigned int irq_want) if (irq > 0) { dynamic_irq_init(irq); + /* restore it, in case dynamic_irq_init clear it */ + if (desc_new) + desc_new->chip_data = cfg_new; } return irq; } @@ -2944,8 +3016,16 @@ int create_irq(void) void destroy_irq(unsigned int irq) { unsigned long flags; + struct irq_cfg *cfg; + struct irq_desc *desc; + /* store it, in case dynamic_irq_cleanup clear it */ + desc = irq_to_desc(irq); + cfg = desc->chip_data; dynamic_irq_cleanup(irq); + /* connect back irq_cfg */ + if (desc) + desc->chip_data = cfg; #ifdef CONFIG_INTR_REMAP free_irte(irq); @@ -3195,26 +3275,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc) { unsigned int irq; int ret; unsigned int irq_want; - irq_want = build_irq_for_pci_dev(dev) + 0x100; - + irq_want = nr_irqs - 1; irq = create_irq_nr(irq_want); if (irq == 0) return -1; @@ -3228,7 +3295,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) goto error; no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) { destroy_irq(irq); return ret; @@ -3246,7 +3313,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { unsigned int irq; int ret, sub_handle; - struct msi_desc *desc; + struct msi_desc *msidesc; unsigned int irq_want; #ifdef CONFIG_INTR_REMAP @@ -3254,10 +3321,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) int index = 0; #endif - irq_want = build_irq_for_pci_dev(dev) + 0x100; + irq_want = nr_irqs - 1; sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); + list_for_each_entry(msidesc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want); + irq_want--; if (irq == 0) return -1; #ifdef CONFIG_INTR_REMAP @@ -3289,7 +3357,7 @@ int arch_setup_msi_irqs(struct 
pci_dev *dev, int nvec, int type) } no_ir: #endif - ret = setup_msi_irq(dev, desc, irq); + ret = setup_msi_irq(dev, msidesc, irq); if (ret < 0) goto error; sub_handle++; @@ -3707,17 +3775,29 @@ int __init io_apic_get_version(int ioapic) int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { + struct irq_desc *desc; + struct irq_cfg *cfg; + int cpu = boot_cpu_id; + if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc %d\n", irq); + return 0; + } + /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); + if (irq >= 16) { + cfg = desc->chip_data; + add_pin_to_irq_cpu(irq, cpu, ioapic, pin); + } setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); @@ -3773,7 +3853,8 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - cfg = irq_cfg(irq); + desc = irq_to_desc(irq); + cfg = desc->chip_data; if (!cfg->vector) { setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), @@ -3785,7 +3866,6 @@ void __init setup_ioapic_dest(void) /* * Honour affinities which have been set in early boot */ - desc = irq_to_desc(irq); if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) mask = desc->affinity; @@ -3846,7 +3926,6 @@ void __init ioapic_init_mappings(void) struct resource *ioapic_res; int i; - irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..3f1d9d18df6 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v) } desc = irq_to_desc(i); + if (!desc) + return 0; + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index a51382672de..119fc9c8ff7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -242,6 +242,8 @@ void fixup_irqs(cpumask_t map) for_each_irq_desc(irq, desc) { cpumask_t mask; + if (!desc) + continue; if (irq == 2) continue; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 60eb84eb77a..900009c7059 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -94,6 +94,8 @@ void fixup_irqs(cpumask_t map) int break_affinity = 0; int set_affinity = 1; + if (!desc) + continue; if (irq == 2) continue; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa9803e8..5a5651b7f9e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -69,7 +69,6 @@ void __init init_ISA_irqs (void) * 16 old-style INTA-cycle interrupts: */ for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff023539128..cd9f42d028d 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -143,7 +143,6 @@ void __init init_ISA_irqs(void) init_8259A(0); for (i = 0; i < 16; i++) { - /* first time call this irq_desc */ struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; diff --git a/drivers/char/random.c b/drivers/char/random.c index 675076f5fca..d26891bfcd4 100644 --- a/drivers/char/random.c +++ 
b/drivers/char/random.c @@ -558,23 +558,9 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -static struct timer_rand_state *irq_timer_state[NR_IRQS]; - -static struct timer_rand_state *get_timer_rand_state(unsigned int irq) -{ - if (irq >= nr_irqs) - return NULL; - - return irq_timer_state[irq]; -} - -static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) -{ - if (irq >= nr_irqs) - return; - - irq_timer_state[irq] = state; -} +#ifndef CONFIG_SPARSE_IRQ +struct timer_rand_state *irq_timer_state[NR_IRQS]; +#endif static struct timer_rand_state input_timer_state; @@ -933,8 +919,10 @@ void rand_initialize_irq(int irq) { struct timer_rand_state *state; +#ifndef CONFIG_SPARSE_IRQ if (irq >= nr_irqs) return; +#endif state = get_timer_rand_state(irq); diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 2de5a3238c9..c9958ec5e25 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -19,17 +19,75 @@ struct irq_2_iommu { u8 irte_mask; }; -static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; +#ifdef CONFIG_SPARSE_IRQ +static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu) +{ + struct irq_2_iommu *iommu; + int node; + + node = cpu_to_node(cpu); + + iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node); + printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node); + + return iommu; +} static struct irq_2_iommu *irq_2_iommu(unsigned int irq) { - return (irq < nr_irqs) ? irq_2_iommuX + irq : NULL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (WARN_ON_ONCE(!desc)) + return NULL; + + return desc->irq_2_iommu; +} + +static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc; + struct irq_2_iommu *irq_iommu; + + /* + * alloc irq desc if not allocated already. + */ + desc = irq_to_desc_alloc_cpu(irq, cpu); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return NULL; + } + + irq_iommu = desc->irq_2_iommu; + + if (!irq_iommu) + desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu); + + return desc->irq_2_iommu; } +static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) +{ + return irq_2_iommu_alloc_cpu(irq, boot_cpu_id); +} + +#else /* !CONFIG_SPARSE_IRQ */ + +static struct irq_2_iommu irq_2_iommuX[NR_IRQS]; + +static struct irq_2_iommu *irq_2_iommu(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_2_iommuX[irq]; + + return NULL; +} static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { return irq_2_iommu(irq); } +#endif static DEFINE_SPINLOCK(irq_2_ir_lock); @@ -86,9 +144,11 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) if (!count) return -1; +#ifndef CONFIG_SPARSE_IRQ /* protect irq_2_iommu_alloc later */ if (irq >= nr_irqs) return -1; +#endif /* * start the IRTE search from index 0. 
@@ -130,6 +190,12 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) table->base[i].present = 1; irq_iommu = irq_2_iommu_alloc(irq); + if (!irq_iommu) { + spin_unlock(&irq_2_ir_lock); + printk(KERN_ERR "can't allocate irq_2_iommu\n"); + return -1; + } + irq_iommu->iommu = iommu; irq_iommu->irte_index = index; irq_iommu->sub_handle = 0; @@ -177,6 +243,12 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) irq_iommu = irq_2_iommu_alloc(irq); + if (!irq_iommu) { + spin_unlock(&irq_2_ir_lock); + printk(KERN_ERR "can't allocate irq_2_iommu\n"); + return -1; + } + irq_iommu->iommu = iommu; irq_iommu->irte_index = index; irq_iommu->sub_handle = subhandle; diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 1e3b934a4cf..2924faa7f6c 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -141,8 +141,12 @@ static void init_evtchn_cpu_bindings(void) int i; /* By default all event channels notify CPU#0. */ - for_each_irq_desc(i, desc) + for_each_irq_desc(i, desc) { + if (!desc) + continue; + desc->affinity = cpumask_of_cpu(0); + } #endif memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); @@ -231,7 +235,7 @@ static int find_unbound_irq(void) int irq; /* Only allocate from dynirq range */ - for_each_irq_nr(irq) + for (irq = 0; irq < nr_irqs; irq++) if (irq_bindcount[irq] == 0) break; @@ -792,7 +796,7 @@ void xen_irq_resume(void) mask_evtchn(evtchn); /* No IRQ <-> event-channel mappings. */ - for_each_irq_nr(irq) + for (irq = 0; irq < nr_irqs; irq++) irq_info[irq].evtchn = 0; /* zap event-channel binding */ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) @@ -824,7 +828,7 @@ void __init xen_init_IRQ(void) mask_evtchn(i); /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for_each_irq_nr(i) + for (i = 0; i < nr_irqs; i++) irq_bindcount[i] = 0; irq_ctx_init(smp_processor_id()); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 81904f07679..a13431ab7c6 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -27,6 +27,7 @@ static int show_stat(struct seq_file *p, void *v) u64 sum = 0; struct timespec boottime; unsigned int per_irq_sum; + struct irq_desc *desc; user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; @@ -44,10 +45,11 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - - for_each_irq_nr(j) + for_each_irq_desc(j, desc) { + if (!desc) + continue; sum += kstat_irqs_cpu(j, i); - + } sum += arch_irq_stat_cpu(i); } sum += arch_irq_stat(); @@ -90,11 +92,14 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ - for_each_irq_nr(j) { + for (j = 0; j < NR_IRQS; j++) { + desc = irq_to_desc(j); per_irq_sum = 0; - for_each_possible_cpu(i) - per_irq_sum += kstat_irqs_cpu(j, i); + if (desc) { + for_each_possible_cpu(i) + per_irq_sum += kstat_irqs_cpu(j, i); + } seq_printf(p, " %u", per_irq_sum); } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929..79e915e7e8a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -18,6 +18,8 @@ #include #include +extern int nr_irqs; + /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. 
When diff --git a/include/linux/irq.h b/include/linux/irq.h index 3dddfa703eb..63b00439d4d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -129,6 +129,8 @@ struct irq_chip { const char *typename; }; +struct timer_rand_state; +struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor @@ -154,6 +156,13 @@ struct irq_chip { */ struct irq_desc { unsigned int irq; +#ifdef CONFIG_SPARSE_IRQ + struct timer_rand_state *timer_rand_state; + unsigned int *kstat_irqs; +# ifdef CONFIG_INTR_REMAP + struct irq_2_iommu *irq_2_iommu; +# endif +#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -181,14 +190,52 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; +extern void early_irq_init(void); +extern void arch_early_irq_init(void); +extern void arch_init_chip_data(struct irq_desc *desc, int cpu); +extern void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu); +extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); + +#ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; static inline struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < nr_irqs) ? irq_desc + irq : NULL; + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} +static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + return irq_to_desc(irq); } +#ifdef CONFIG_GENERIC_HARDIRQS +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ + irq >= 0; irq--, desc--) +#endif + +#else + +extern struct irq_desc *irq_to_desc(unsigned int irq); +extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq)) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq)) + +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]++) + +#endif + /* * Migration helpers for obsolete names, they will go away: */ @@ -380,6 +427,11 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #define get_irq_data(irq) (irq_to_desc(irq)->handler_data) #define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) +#define get_irq_desc_chip(desc) ((desc)->chip) +#define get_irq_desc_chip_data(desc) ((desc)->chip_data) +#define get_irq_desc_data(desc) ((desc)->handler_data) +#define get_irq_desc_msi(desc) ((desc)->msi_desc) + #endif /* CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 452c280c811..7a299e989f8 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -7,18 +7,10 @@ # define for_each_irq_desc(irq, desc) \ for (irq = 0; irq < nr_irqs; irq++) -#else -extern int nr_irqs; -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) - -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ - irq >= 0; irq--, desc--) +static inline early_sparse_irq_init(void) +{ +} #endif -#define 
for_each_irq_nr(irq) \ - for (irq = 0; irq < nr_irqs; irq++) - #endif diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..4ee4b3d2316 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,9 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; - unsigned int irqs[NR_IRQS]; +#ifndef CONFIG_SPARSE_IRQ + unsigned int irqs[NR_IRQS]; +#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +#ifndef CONFIG_SPARSE_IRQ +#define kstat_irqs_this_cpu(irq) \ + (kstat_this_cpu.irqs[irq]) + struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, @@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, { kstat_this_cpu.irqs[irq]++; } +#endif + +#ifndef CONFIG_SPARSE_IRQ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } +#else +extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#endif /* * Number of interrupts per specific IRQ source, since bootup diff --git a/include/linux/random.h b/include/linux/random.h index 36f125c0c60..ad9daa2374d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -44,6 +44,57 @@ struct rand_pool_info { extern void rand_initialize_irq(int irq); +struct timer_rand_state; +#ifndef CONFIG_SPARSE_IRQ + +extern struct timer_rand_state *irq_timer_state[]; + +extern int nr_irqs; +static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + if (irq >= nr_irqs) + return NULL; + + return irq_timer_state[irq]; +} + +static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + if (irq >= nr_irqs) + return; + + irq_timer_state[irq] = state; +} + +#else + +#include +static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (!desc) + return NULL; + + return desc->timer_rand_state; +} + +static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (!desc) + return; + + desc->timer_rand_state = state; +} +#endif + + extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq); diff --git a/init/main.c b/init/main.c index 7e117a231af..c1f999a3cf3 100644 --- a/init/main.c +++ b/init/main.c @@ -539,6 +539,15 @@ void __init __weak thread_info_cache_init(void) { } +void __init __weak arch_early_irq_init(void) +{ +} + +void __init __weak early_irq_init(void) +{ + arch_early_irq_init(); +} + asmlinkage void __init start_kernel(void) { char * command_line; @@ -603,6 +612,8 @@ asmlinkage void __init start_kernel(void) sort_main_extable(); trap_init(); rcu_init(); + /* init some links before init_ISA_irqs() */ + early_irq_init(); init_IRQ(); pidhash_init(); init_timers(); diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index cc0f7321b8c..650ce4102a6 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -40,6 +40,9 @@ unsigned long probe_irq_on(void) * flush such a longstanding irq before considering it as spurious. 
*/ for_each_irq_desc_reverse(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { /* @@ -68,6 +71,9 @@ unsigned long probe_irq_on(void) * happened in the previous stage, it may have masked itself) */ for_each_irq_desc_reverse(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; @@ -86,6 +92,9 @@ unsigned long probe_irq_on(void) * Now filter out any obviously spurious interrupts */ for_each_irq_desc(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); status = desc->status; @@ -124,6 +133,9 @@ unsigned int probe_irq_mask(unsigned long val) int i; for_each_irq_desc(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); status = desc->status; @@ -166,6 +178,9 @@ int probe_irq_off(unsigned long val) unsigned int status; for_each_irq_desc(i, desc) { + if (!desc) + continue; + spin_lock_irq(&desc->lock); status = desc->status; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 10b5092e9bf..8e4fce4a1b1 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -24,9 +24,10 @@ */ void dynamic_irq_init(unsigned int irq) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; unsigned long flags; + desc = irq_to_desc(irq); if (!desc) { WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); return; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c815b42d0f5..96ca203eb51 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -15,9 +15,16 @@ #include #include #include +#include +#include #include "internals.h" +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -49,6 +56,155 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); +void __init __attribute__((weak)) arch_early_irq_init(void) +{ +} + +#ifdef CONFIG_SPARSE_IRQ +static struct irq_desc irq_desc_init = { + .irq = -1, + .status = IRQ_DISABLED, + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif +}; + +static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +{ + unsigned long bytes; + char *ptr; + int node; + + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + + node = cpu_to_node(cpu); + ptr = kzalloc_node(bytes, GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc kstat_irqs on cpu %d node %d\n", cpu, node); + + if (ptr) + desc->kstat_irqs = (unsigned int *)ptr; +} + +void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu) +{ +} + +static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) +{ + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); + desc->irq = irq; +#ifdef CONFIG_SMP + desc->cpu = cpu; +#endif + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_kstat_irqs(desc, cpu, nr_cpu_ids); + if (!desc->kstat_irqs) { + printk(KERN_ERR "can not alloc kstat_irqs\n"); + BUG_ON(1); + } + arch_init_chip_data(desc, cpu); +} + +/* + * Protect the sparse_irqs: + */ +static DEFINE_SPINLOCK(sparse_irq_lock); + +struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; + +static struct irq_desc irq_desc_legacy[16] __cacheline_aligned_in_smp = { + [0 
... 15] = { + .irq = -1, + .status = IRQ_DISABLED, + .chip = &no_irq_chip, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), +#ifdef CONFIG_SMP + .affinity = CPU_MASK_ALL +#endif + } +}; + +/* FIXME: use bootmem alloc ...*/ +static unsigned int kstat_irqs_legacy[16][NR_CPUS]; + +void __init early_irq_init(void) +{ + struct irq_desc *desc; + int legacy_count; + int i; + + desc = irq_desc_legacy; + legacy_count = ARRAY_SIZE(irq_desc_legacy); + + for (i = 0; i < legacy_count; i++) { + desc[i].irq = i; + desc[i].kstat_irqs = kstat_irqs_legacy[i]; + + irq_desc_ptrs[i] = desc + i; + } + + for (i = legacy_count; i < NR_IRQS; i++) + irq_desc_ptrs[i] = NULL; + + arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; +} + +struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc; + unsigned long flags; + int node; + + if (irq >= NR_IRQS) { + printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", + irq, NR_IRQS); + WARN_ON(1); + return NULL; + } + + desc = irq_desc_ptrs[irq]; + if (desc) + return desc; + + spin_lock_irqsave(&sparse_irq_lock, flags); + + /* We have to check it to avoid races with another CPU */ + desc = irq_desc_ptrs[irq]; + if (desc) + goto out_unlock; + + node = cpu_to_node(cpu); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + printk(KERN_DEBUG " alloc irq_desc for %d on cpu %d node %d\n", + irq, cpu, node); + if (!desc) { + printk(KERN_ERR "can not alloc irq_desc\n"); + BUG_ON(1); + } + init_one_irq_desc(irq, desc, cpu); + + irq_desc_ptrs[irq] = desc; + +out_unlock: + spin_unlock_irqrestore(&sparse_irq_lock, flags); + + return desc; +} + +#else + struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .status = IRQ_DISABLED, @@ -62,6 +218,8 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { } }; +#endif + /* * What should we do if we get a hw irq event on an illegal vector? * Each architecture has to answer this themself. @@ -261,17 +419,28 @@ out: #ifdef CONFIG_TRACE_IRQFLAGS -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -static struct lock_class_key irq_desc_lock_class; - void early_init_irq_lock_class(void) { +#ifndef CONFIG_SPARSE_IRQ struct irq_desc *desc; int i; - for_each_irq_desc(i, desc) + for_each_irq_desc(i, desc) { + if (!desc) + continue; + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + } +#endif +} +#endif + +#ifdef CONFIG_SPARSE_IRQ +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc->kstat_irqs[cpu]; } #endif +EXPORT_SYMBOL(kstat_irqs_cpu); + diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d257e7d6a8a..f6b3440f05b 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -243,7 +243,11 @@ void init_irq_proc(void) /* * Create entries for all existing IRQs. 
*/ - for_each_irq_desc(irq, desc) + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + register_irq_proc(irq, desc); + } } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index dd364c11e56..3738107531f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -91,6 +91,9 @@ static int misrouted_irq(int irq) int i, ok = 0; for_each_irq_desc(i, desc) { + if (!desc) + continue; + if (!i) continue; @@ -112,6 +115,8 @@ static void poll_spurious_irqs(unsigned long dummy) for_each_irq_desc(i, desc) { unsigned int status; + if (!desc) + continue; if (!i) continue; -- cgit v1.2.3-70-g09d2 From 8a4830f8891be6b4e04809693a24771a4694e0b0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Dec 2008 11:56:20 -0800 Subject: sparseirq: fix !SMP && !PCI_MSI && !HT_IRQ build Ingo Molnar wrote: >>> drivers/pci/intr_remapping.c: In function 'irq_2_iommu_alloc': >>> drivers/pci/intr_remapping.c:72: error: 'boot_cpu_id' undeclared (first use in this function) >>> drivers/pci/intr_remapping.c:72: error: (Each undeclared identifier is reported only once >>> drivers/pci/intr_remapping.c:72: error: for each function it appears in.) sparseirq should only be used with SMP for now. --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 48ac688de3c..8943c13502c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -240,7 +240,7 @@ config X86_HAS_BOOT_CPU_ID config SPARSE_IRQ bool "Support sparse irq numbering" - depends on PCI_MSI || HT_IRQ + depends on (PCI_MSI || HT_IRQ) && SMP default y help This enables support for sparse irq, esp for msi/msi-x. You may need -- cgit v1.2.3-70-g09d2 From b93a531e315e97ef00367099e6b5f19651936e20 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 16 Dec 2008 11:40:27 +0000 Subject: allow bug table entries to use relative pointers (and use it on x86-64) Impact: reduce bug table size This allows reducing the bug table size by half. Perhaps there are other 64-bit architectures that could also make use of this. 
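To make the encoding concrete, here is a minimal sketch in C (the rel_entry/decode names are made up for this illustration; the real change is the bug_entry/bug_addr() code below). A displacement computed at build time as target - &entry fits in 32 bits because the x86-64 kernel image is linked within a 2 GB window, and decoding simply adds it back:

struct rel_entry {
	signed int addr_disp;	/* target address minus this entry's own address */
};

static unsigned long decode(const struct rel_entry *e)
{
	/* the sign-extended 32-bit displacement restores the 64-bit address */
	return (unsigned long)e + e->addr_disp;
}

Each pointer field shrinks from 8 bytes to 4, which is where the halving of the table size comes from.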
Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/bug.h | 2 +- include/asm-generic/bug.h | 8 ++++++++ lib/bug.c | 19 +++++++++++++++++-- 4 files changed, 30 insertions(+), 3 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f..ab98cca84e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -87,6 +87,10 @@ config GENERIC_IOMAP config GENERIC_BUG def_bool y depends on BUG + select GENERIC_BUG_RELATIVE_POINTERS if X86_64 + +config GENERIC_BUG_RELATIVE_POINTERS + bool config GENERIC_HWEIGHT def_bool y diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 3def2065fce..d9cf1cd156d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -9,7 +9,7 @@ #ifdef CONFIG_X86_32 # define __BUG_C0 "2:\t.long 1b, %c0\n" #else -# define __BUG_C0 "2:\t.quad 1b, %c0\n" +# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" #endif #define BUG() \ diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 12c07c1866b..4c794d73fb8 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -8,9 +8,17 @@ #ifdef CONFIG_GENERIC_BUG #ifndef __ASSEMBLY__ struct bug_entry { +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS unsigned long bug_addr; +#else + signed int bug_addr_disp; +#endif #ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS const char *file; +#else + signed int file_disp; +#endif unsigned short line; #endif unsigned short flags; diff --git a/lib/bug.c b/lib/bug.c index bfeafd60ee9..300e41afbf9 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -5,6 +5,8 @@ CONFIG_BUG - emit BUG traps. Nothing happens without this. CONFIG_GENERIC_BUG - enable this code. + CONFIG_GENERIC_BUG_RELATIVE_POINTERS - use 32-bit pointers relative to + the containing struct bug_entry for bug_addr and file. 
CONFIG_DEBUG_BUGVERBOSE - emit full file+line information for each BUG CONFIG_BUG and CONFIG_DEBUG_BUGVERBOSE are potentially user-settable @@ -43,6 +45,15 @@ extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; +static inline unsigned long bug_addr(const struct bug_entry *bug) +{ +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS + return bug->bug_addr; +#else + return (unsigned long)bug + bug->bug_addr_disp; +#endif +} + #ifdef CONFIG_MODULES static LIST_HEAD(module_bug_list); @@ -55,7 +66,7 @@ static const struct bug_entry *module_find_bug(unsigned long bugaddr) unsigned i; for (i = 0; i < mod->num_bugs; ++i, ++bug) - if (bugaddr == bug->bug_addr) + if (bugaddr == bug_addr(bug)) return bug; } return NULL; @@ -108,7 +119,7 @@ const struct bug_entry *find_bug(unsigned long bugaddr) const struct bug_entry *bug; for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) - if (bugaddr == bug->bug_addr) + if (bugaddr == bug_addr(bug)) return bug; return module_find_bug(bugaddr); @@ -133,7 +144,11 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (bug) { #ifdef CONFIG_DEBUG_BUGVERBOSE +#ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS file = bug->file; +#else + file = (const char *)bug + bug->file_disp; +#endif line = bug->line; #endif warning = (bug->flags & BUGFLAG_WARNING) != 0; -- cgit v1.2.3-70-g09d2 From 48a1b10aff588833b73994704c47bbd0deb73e9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 11 Dec 2008 00:15:01 -0800 Subject: x86, sparseirq: move irq_desc according to smp_affinity, v7 Impact: improve NUMA handling by migrating irq_desc on smp_affinity changes if CONFIG_NUMA_MIGRATE_IRQ_DESC is set: - make irq_desc to go with affinity aka irq_desc moving etc - call move_irq_desc in irq_complete_move() - legacy irq_desc is not moved, because they are allocated via static array for logical apic mode, need to add move_desc_in_progress_in_same_domain, otherwise it will not be moved ==> also could need two phases to get irq_desc moved. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 +++ arch/x86/kernel/io_apic.c | 142 +++++++++++++++++++++++++++++++++++++++++++++- include/linux/irq.h | 10 ++++ kernel/irq/Makefile | 1 + kernel/irq/chip.c | 12 +++- kernel/irq/handle.c | 15 +++-- kernel/irq/internals.h | 5 ++ kernel/irq/numa_migrate.c | 127 +++++++++++++++++++++++++++++++++++++++++ 8 files changed, 313 insertions(+), 8 deletions(-) create mode 100644 kernel/irq/numa_migrate.c (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8943c13502c..29073532f94 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -248,6 +248,15 @@ config SPARSE_IRQ If you don't know what to do here, say Y. +config NUMA_MIGRATE_IRQ_DESC + bool "Move irq desc when changing irq smp_affinity" + depends on SPARSE_IRQ && SMP + default n + help + This enables moving irq_desc to cpu/node that irq will use handled. + + If you don't know what to do here, say N. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index a1a2e070f31..bfe1245b1a3 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -141,6 +141,9 @@ struct irq_cfg { unsigned move_cleanup_count; u8 vector; u8 move_in_progress : 1; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + u8 move_desc_pending : 1; +#endif }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
*/ @@ -223,6 +226,121 @@ void arch_init_chip_data(struct irq_desc *desc, int cpu) } } +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + +static void +init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu) +{ + struct irq_pin_list *old_entry, *head, *tail, *entry; + + cfg->irq_2_pin = NULL; + old_entry = old_cfg->irq_2_pin; + if (!old_entry) + return; + + entry = get_one_free_irq_2_pin(cpu); + if (!entry) + return; + + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + head = entry; + tail = entry; + old_entry = old_entry->next; + while (old_entry) { + entry = get_one_free_irq_2_pin(cpu); + if (!entry) { + entry = head; + while (entry) { + head = entry->next; + kfree(entry); + entry = head; + } + /* still use the old one */ + return; + } + entry->apic = old_entry->apic; + entry->pin = old_entry->pin; + tail->next = entry; + tail = entry; + old_entry = old_entry->next; + } + + tail->next = NULL; + cfg->irq_2_pin = head; +} + +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry, *next; + + if (old_cfg->irq_2_pin == cfg->irq_2_pin) + return; + + entry = old_cfg->irq_2_pin; + + while (entry) { + next = entry->next; + kfree(entry); + entry = next; + } + old_cfg->irq_2_pin = NULL; +} + +void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) +{ + struct irq_cfg *cfg; + struct irq_cfg *old_cfg; + + cfg = get_one_free_irq_cfg(cpu); + + if (!cfg) + return; + + desc->chip_data = cfg; + + old_cfg = old_desc->chip_data; + + memcpy(cfg, old_cfg, sizeof(struct irq_cfg)); + + init_copy_irq_2_pin(old_cfg, cfg, cpu); +} + +static void free_irq_cfg(struct irq_cfg *old_cfg) +{ + kfree(old_cfg); +} + +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) +{ + struct irq_cfg *old_cfg, *cfg; + + old_cfg = old_desc->chip_data; + cfg = desc->chip_data; + + if (old_cfg == cfg) + return; + + if (old_cfg) { + free_irq_2_pin(old_cfg, cfg); + free_irq_cfg(old_cfg); + old_desc->chip_data = NULL; + } +} + +static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) +{ + struct irq_cfg *cfg = desc->chip_data; + + if (!cfg->move_in_progress) { + /* it means that domain is not changed */ + if (!cpus_intersects(desc->affinity, mask)) + cfg->move_desc_pending = 1; + } +} +#endif + #else static struct irq_cfg *irq_cfg(unsigned int irq) { @@ -231,9 +349,11 @@ static struct irq_cfg *irq_cfg(unsigned int irq) #endif +#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) { } +#endif struct io_apic { unsigned int index; @@ -2346,14 +2466,34 @@ static void irq_complete_move(struct irq_desc **descp) struct irq_cfg *cfg = desc->chip_data; unsigned vector, me; - if (likely(!cfg->move_in_progress)) + if (likely(!cfg->move_in_progress)) { +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + if (likely(!cfg->move_desc_pending)) + return; + + /* domain is not change, but affinity is changed */ + me = smp_processor_id(); + if (cpu_isset(me, desc->affinity)) { + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; + cfg->move_desc_pending = 0; + } +#endif return; + } vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + *descp = desc = move_irq_desc(desc, me); + /* get the new one */ + cfg = desc->chip_data; +#endif + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); 
cfg->move_cleanup_count = cpus_weight(cleanup_mask); send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); diff --git a/include/linux/irq.h b/include/linux/irq.h index b5749db3e5a..36a01574678 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); #endif +static inline struct irq_desc * +irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) +{ +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + return irq_to_desc(irq); +#else + return desc; +#endif +} + /* * Migration helpers for obsolete names, they will go away: */ diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 681c52dbfe2..4dd5b1edac9 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o +obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 8e4fce4a1b1..de210f4b7a9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -353,6 +353,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) spin_lock(&desc->lock); mask_ack_irq(desc, irq); + desc = irq_remap_to_desc(irq, desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -430,6 +431,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; out: desc->chip->eoi(irq); + desc = irq_remap_to_desc(irq, desc); spin_unlock(&desc->lock); } @@ -466,12 +468,14 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); mask_ack_irq(desc, irq); + desc = irq_remap_to_desc(irq, desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ desc->chip->ack(irq); + desc = irq_remap_to_desc(irq, desc); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -532,8 +536,10 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) + if (desc->chip->eoi) { desc->chip->eoi(irq); + desc = irq_remap_to_desc(irq, desc); + } } void @@ -568,8 +574,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, /* Uninstall? 
*/ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) + if (desc->chip != &no_irq_chip) { mask_ack_irq(desc, irq); + desc = irq_remap_to_desc(irq, desc); + } desc->status |= IRQ_DISABLED; desc->depth = 1; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 8aa09547f5e..f1a23069c20 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -23,7 +23,7 @@ /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ -static struct lock_class_key irq_desc_lock_class; +struct lock_class_key irq_desc_lock_class; /** * handle_bad_irq - handle spurious and unhandled irqs @@ -73,7 +73,7 @@ static struct irq_desc irq_desc_init = { #endif }; -static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) +void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) { unsigned long bytes; char *ptr; @@ -113,7 +113,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) /* * Protect the sparse_irqs: */ -static DEFINE_SPINLOCK(sparse_irq_lock); +DEFINE_SPINLOCK(sparse_irq_lock); struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; @@ -337,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) + if (desc->chip->ack) { desc->chip->ack(irq); + /* get new one */ + desc = irq_remap_to_desc(irq, desc); + } if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) @@ -349,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq) } spin_lock(&desc->lock); - if (desc->chip->ack) + if (desc->chip->ack) { desc->chip->ack(irq); + desc = irq_remap_to_desc(irq, desc); + } /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 64c1c7253da..e6d0a43cc12 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, unsigned long flags); +extern struct lock_class_key irq_desc_lock_class; +extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); +extern spinlock_t sparse_irq_lock; +extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; + #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c new file mode 100644 index 00000000000..0178e229699 --- /dev/null +++ b/kernel/irq/numa_migrate.c @@ -0,0 +1,127 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code. 
+ * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ + +#include +#include +#include +#include +#include + +#include "internals.h" + +static void init_copy_kstat_irqs(struct irq_desc *old_desc, + struct irq_desc *desc, + int cpu, int nr) +{ + unsigned long bytes; + + init_kstat_irqs(desc, cpu, nr); + + if (desc->kstat_irqs != old_desc->kstat_irqs) { + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + + memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes); + } +} + +static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) +{ + if (old_desc->kstat_irqs == desc->kstat_irqs) + return; + + kfree(old_desc->kstat_irqs); + old_desc->kstat_irqs = NULL; +} + +static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, + struct irq_desc *desc, int cpu) +{ + memcpy(desc, old_desc, sizeof(struct irq_desc)); + desc->cpu = cpu; + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + arch_init_copy_chip_data(old_desc, desc, cpu); +} + +static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) +{ + free_kstat_irqs(old_desc, desc); + arch_free_chip_data(old_desc, desc); +} + +static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, + int cpu) +{ + struct irq_desc *desc; + unsigned int irq; + unsigned long flags; + int node; + + irq = old_desc->irq; + + spin_lock_irqsave(&sparse_irq_lock, flags); + + /* We have to check it to avoid races with another CPU */ + desc = irq_desc_ptrs[irq]; + + if (desc && old_desc != desc) + goto out_unlock; + + node = cpu_to_node(cpu); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); + printk(KERN_DEBUG " move irq_desc for %d to cpu %d node %d\n", + irq, cpu, node); + if (!desc) { + printk(KERN_ERR "can not get new irq_desc for moving\n"); + /* still use old one */ + desc = old_desc; + goto out_unlock; + } + init_copy_one_irq_desc(irq, old_desc, desc, cpu); + + irq_desc_ptrs[irq] = desc; + + /* free the old one */ + free_one_irq_desc(old_desc, desc); + kfree(old_desc); + +out_unlock: + spin_unlock_irqrestore(&sparse_irq_lock, flags); + + return desc; +} + +struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu) +{ + int old_cpu; + int node, old_node; + + /* those all static, do move them */ + if (desc->irq < NR_IRQS_LEGACY) + return desc; + + old_cpu = desc->cpu; + printk(KERN_DEBUG + "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu); + if (old_cpu != cpu) { + node = cpu_to_node(cpu); + old_node = cpu_to_node(old_cpu); + if (old_node != node) + desc = __real_move_irq_desc(desc, cpu); + else + desc->cpu = cpu; + } + + return desc; +} + -- cgit v1.2.3-70-g09d2 From 17483a1f34c970e6c2cb8c082d4441bfabbe88a9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 12 Dec 2008 13:14:18 -0800 Subject: sparseirq: fix !SMP building, #2 Impact: build fix make intr_remapping.c to include smp.h, so could use boot_cpu_id there also remove old change that disabling sparseirq with !SMP Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- drivers/pci/intr_remapping.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 29073532f94..60a008857a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -240,7 +240,7 @@ config X86_HAS_BOOT_CPU_ID config SPARSE_IRQ bool "Support sparse irq numbering" - depends on (PCI_MSI || HT_IRQ) && SMP + depends on 
PCI_MSI || HT_IRQ default y help This enables support for sparse irq, esp for msi/msi-x. You may need diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index c9958ec5e25..f78371b2252 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -5,6 +5,7 @@ #include #include #include +#include <linux/smp.h> #include #include "intr_remapping.h" -- cgit v1.2.3-70-g09d2 From b909895739427874c089bc0e03dc119f99cab2dd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Dec 2008 13:48:34 -0800 Subject: sparseirq: fix numa_migrate_irq_desc dependency and comments Impact: reduce kconfig variable scope and clean up Bartlomiej pointed out that the config dependencies and comments are not right. Update it to depend on NUMA, and fix some comments. Reported-by: Bartlomiej Zolnierkiewicz Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/io_apic.c | 2 +- kernel/irq/numa_migrate.c | 11 +++-------- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 60a008857a3..5c243826334 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -250,7 +250,7 @@ config SPARSE_IRQ config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" - depends on SPARSE_IRQ && SMP + depends on SPARSE_IRQ && NUMA default n help This enables moving irq_desc to cpu/node that irq will use handled. diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bfe1245b1a3..a74887b416c 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2471,7 +2471,7 @@ static void irq_complete_move(struct irq_desc **descp) if (likely(!cfg->move_desc_pending)) return; - /* domain is not change, but affinity is changed */ + /* domain has not changed, but affinity did */ me = smp_processor_id(); if (cpu_isset(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 0178e229699..089c3746358 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -1,13 +1,8 @@ /* - * linux/kernel/irq/handle.c - * - * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar - * Copyright (C) 2005-2006, Thomas Gleixner, Russell King - * - * This file contains the core interrupt handling code. - * - * Detailed information is available in Documentation/DocBook/genericirq + * NUMA irq-desc migration code + * + * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to + * the new "home node" of the IRQ. */ #include -- cgit v1.2.3-70-g09d2 From 4e17fee24a39448f3a20e9cf98887b7665825848 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 12:04:17 +0100 Subject: x86: turn CONFIG_SPARSE_IRQ off by default New feature - let's disable it by default first - we can flip it around later. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c243826334..d14a8806227 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -241,7 +241,6 @@ config X86_HAS_BOOT_CPU_ID config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ - default y help This enables support for sparse irq, esp for msi/msi-x. You may need if you have lots of cards supports msi-x installed.
-- cgit v1.2.3-70-g09d2 From 973656fe1afb4adf95d7b9ab75d4660cd3821ea1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 16:26:47 +0100 Subject: x86, sparseirq: clean up Kconfig entry Impact: improve help text Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/Kconfig') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d14a8806227..e4c038abb71 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -242,10 +242,14 @@ config SPARSE_IRQ bool "Support sparse irq numbering" depends on PCI_MSI || HT_IRQ help - This enables support for sparse irq, esp for msi/msi-x. You may need - if you have lots of cards supports msi-x installed. + This enables support for sparse irqs. This is useful for distro + kernels that want to define a high CONFIG_NR_CPUS value but still + want to have low kernel memory footprint on smaller machines. - If you don't know what to do here, say Y. + ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread + out the irq_desc[] array in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. config NUMA_MIGRATE_IRQ_DESC bool "Move irq desc when changing irq smp_affinity" -- cgit v1.2.3-70-g09d2