diff options
336 files changed, 11162 insertions, 8228 deletions
diff --git a/Documentation/DocBook/debugobjects.tmpl b/Documentation/DocBook/debugobjects.tmpl index 08ff908aa7a..24979f691e3 100644 --- a/Documentation/DocBook/debugobjects.tmpl +++ b/Documentation/DocBook/debugobjects.tmpl @@ -96,6 +96,7 @@ <listitem><para>debug_object_deactivate</para></listitem> <listitem><para>debug_object_destroy</para></listitem> <listitem><para>debug_object_free</para></listitem> + <listitem><para>debug_object_assert_init</para></listitem> </itemizedlist> Each of these functions takes the address of the real object and a pointer to the object type specific debug description @@ -273,6 +274,26 @@ debug checks. </para> </sect1> + + <sect1 id="debug_object_assert_init"> + <title>debug_object_assert_init</title> + <para> + This function is called to assert that an object has been + initialized. + </para> + <para> + When the real object is not tracked by debugobjects, it calls + fixup_assert_init of the object type description structure + provided by the caller, with the hardcoded object state + ODEBUG_NOT_AVAILABLE. The fixup function can correct the problem + by calling debug_object_init and other specific initializing + functions. + </para> + <para> + When the real object is already tracked by debugobjects it is + ignored. + </para> + </sect1> </chapter> <chapter id="fixupfunctions"> <title>Fixup functions</title> @@ -381,6 +402,35 @@ statistics. </para> </sect1> + <sect1 id="fixup_assert_init"> + <title>fixup_assert_init</title> + <para> + This function is called from the debug code whenever a problem + in debug_object_assert_init is detected. + </para> + <para> + Called from debug_object_assert_init() with a hardcoded state + ODEBUG_STATE_NOTAVAILABLE when the object is not found in the + debug bucket. + </para> + <para> + The function returns 1 when the fixup was successful, + otherwise 0. The return value is used to update the + statistics. + </para> + <para> + Note, this function should make sure debug_object_init() is + called before returning. + </para> + <para> + The handling of statically initialized objects is a special + case. The fixup function should check if this is a legitimate + case of a statically initialized object or not. In this case only + debug_object_init() should be called to make the object known to + the tracker. Then the function should return 0 because this is not + a real fixup. + </para> + </sect1> </chapter> <chapter id="bugs"> <title>Known Bugs And Assumptions</title> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 81c287fad79..e229769606f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1885,6 +1885,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. arch_perfmon: [X86] Force use of architectural perfmon on Intel CPUs instead of the CPU specific event set. + timer: [X86] Force use of architectural NMI + timer mode (see also oprofile.timer + for generic hr timer mode) + [s390] Force legacy basic mode sampling + (report cpu_type "timer") oops=panic Always panic on oopses. Default is to just kill the process, but there is a small probability of @@ -2750,11 +2755,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. functions are at fixed addresses, they make nice targets for exploits that can control RIP. - emulate Vsyscalls turn into traps and are emulated - reasonably safely. + emulate [default] Vsyscalls turn into traps and are + emulated reasonably safely. - native [default] Vsyscalls are native syscall - instructions. + native Vsyscalls are native syscall instructions. This is a little bit faster than trapping and makes a few dynamic recompilers work better than they would in emulation mode. diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index b510564aac7..bb24c2a0e87 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -191,8 +191,6 @@ And for string fields they are: Currently, only exact string matches are supported. -Currently, the maximum number of predicates in a filter is 16. - 5.2 Setting filters ------------------- diff --git a/arch/Kconfig b/arch/Kconfig index 4b0669cbb3b..2505740b81d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -30,6 +30,10 @@ config OPROFILE_EVENT_MULTIPLEX config HAVE_OPROFILE bool +config OPROFILE_NMI_TIMER + def_bool y + depends on PERF_EVENTS && HAVE_PERF_EVENTS_NMI + config KPROBES bool "Kprobes" depends on MODULES diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 8fc2c8fcbdc..c0b59bff6be 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -52,6 +52,7 @@ #include <asm/mach/time.h> #include <asm/traps.h> #include <asm/unwind.h> +#include <asm/memblock.h> #if defined(CONFIG_DEPRECATED_PARAM_STRUCT) #include "compat.h" diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index fbdd12ea3a5..7c38474e533 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -32,6 +32,7 @@ #include <asm/mach/arch.h> #include <asm/mach/map.h> +#include <asm/memblock.h> #include "mm.h" @@ -332,7 +333,6 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc) sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); - memblock_init(); for (i = 0; i < mi->nr_banks; i++) memblock_add(mi->bank[i].start, mi->bank[i].size); @@ -371,7 +371,7 @@ void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc) if (mdesc->reserve) mdesc->reserve(); - memblock_analyze(); + memblock_allow_resize(); memblock_dump_all(); } diff --git a/arch/cris/arch-v32/kernel/time.c b/arch/cris/arch-v32/kernel/time.c index bb978ede898..6773fc83a67 100644 --- a/arch/cris/arch-v32/kernel/time.c +++ b/arch/cris/arch-v32/kernel/time.c @@ -47,14 +47,12 @@ static struct clocksource cont_rotime = { .rating = 300, .read = read_cont_rotime, .mask = CLOCKSOURCE_MASK(32), - .shift = 10, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static int __init etrax_init_cont_rotime(void) { - cont_rotime.mult = clocksource_khz2mult(100000, cont_rotime.shift); - clocksource_register(&cont_rotime); + clocksource_register_khz(&cont_rotime, 100000); return 0; } arch_initcall(etrax_init_cont_rotime); diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 27489b6dd53..3b7a7c48378 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -23,6 +23,9 @@ config IA64 select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select IRQ_PER_CPU @@ -474,9 +477,6 @@ config NODES_SHIFT MAX_NUMNODES will be 2^(This value). If in doubt, use the default. -config ARCH_POPULATES_NODE_MAP - def_bool y - # VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent. # VIRTUAL_MEM_MAP has been retained for historical reasons. config VIRTUAL_MEM_MAP diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index 5a274af31b2..3deac956d32 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -26,60 +26,53 @@ #include <linux/jiffies.h> #include <asm/processor.h> -typedef u64 cputime_t; -typedef u64 cputime64_t; +typedef u64 __nocast cputime_t; +typedef u64 __nocast cputime64_t; -#define cputime_zero ((cputime_t)0) #define cputime_one_jiffy jiffies_to_cputime(1) -#define cputime_max ((~((cputime_t)0) >> 1) - 1) -#define cputime_add(__a, __b) ((__a) + (__b)) -#define cputime_sub(__a, __b) ((__a) - (__b)) -#define cputime_div(__a, __n) ((__a) / (__n)) -#define cputime_halve(__a) ((__a) >> 1) -#define cputime_eq(__a, __b) ((__a) == (__b)) -#define cputime_gt(__a, __b) ((__a) > (__b)) -#define cputime_ge(__a, __b) ((__a) >= (__b)) -#define cputime_lt(__a, __b) ((__a) < (__b)) -#define cputime_le(__a, __b) ((__a) <= (__b)) - -#define cputime64_zero ((cputime64_t)0) -#define cputime64_add(__a, __b) ((__a) + (__b)) -#define cputime64_sub(__a, __b) ((__a) - (__b)) -#define cputime_to_cputime64(__ct) (__ct) /* * Convert cputime <-> jiffies (HZ) */ -#define cputime_to_jiffies(__ct) ((__ct) / (NSEC_PER_SEC / HZ)) -#define jiffies_to_cputime(__jif) ((__jif) * (NSEC_PER_SEC / HZ)) -#define cputime64_to_jiffies64(__ct) ((__ct) / (NSEC_PER_SEC / HZ)) -#define jiffies64_to_cputime64(__jif) ((__jif) * (NSEC_PER_SEC / HZ)) +#define cputime_to_jiffies(__ct) \ + ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) +#define jiffies_to_cputime(__jif) \ + (__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ)) +#define cputime64_to_jiffies64(__ct) \ + ((__force u64)(__ct) / (NSEC_PER_SEC / HZ)) +#define jiffies64_to_cputime64(__jif) \ + (__force cputime64_t)((__jif) * (NSEC_PER_SEC / HZ)) /* * Convert cputime <-> microseconds */ -#define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) -#define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) -#define usecs_to_cputime64(__usecs) usecs_to_cputime(__usecs) +#define cputime_to_usecs(__ct) \ + ((__force u64)(__ct) / NSEC_PER_USEC) +#define usecs_to_cputime(__usecs) \ + (__force cputime_t)((__usecs) * NSEC_PER_USEC) +#define usecs_to_cputime64(__usecs) \ + (__force cputime64_t)((__usecs) * NSEC_PER_USEC) /* * Convert cputime <-> seconds */ -#define cputime_to_secs(__ct) ((__ct) / NSEC_PER_SEC) -#define secs_to_cputime(__secs) ((__secs) * NSEC_PER_SEC) +#define cputime_to_secs(__ct) \ + ((__force u64)(__ct) / NSEC_PER_SEC) +#define secs_to_cputime(__secs) \ + (__force cputime_t)((__secs) * NSEC_PER_SEC) /* * Convert cputime <-> timespec (nsec) */ static inline cputime_t timespec_to_cputime(const struct timespec *val) { - cputime_t ret = val->tv_sec * NSEC_PER_SEC; - return (ret + val->tv_nsec); + u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_nsec; + return (__force cputime_t) ret; } static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) { - val->tv_sec = ct / NSEC_PER_SEC; - val->tv_nsec = ct % NSEC_PER_SEC; + val->tv_sec = (__force u64) ct / NSEC_PER_SEC; + val->tv_nsec = (__force u64) ct % NSEC_PER_SEC; } /* @@ -87,25 +80,28 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val) */ static inline cputime_t timeval_to_cputime(struct timeval *val) { - cputime_t ret = val->tv_sec * NSEC_PER_SEC; - return (ret + val->tv_usec * NSEC_PER_USEC); + u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC; + return (__force cputime_t) ret; } static inline void cputime_to_timeval(const cputime_t ct, struct timeval *val) { - val->tv_sec = ct / NSEC_PER_SEC; - val->tv_usec = (ct % NSEC_PER_SEC) / NSEC_PER_USEC; + val->tv_sec = (__force u64) ct / NSEC_PER_SEC; + val->tv_usec = ((__force u64) ct % NSEC_PER_SEC) / NSEC_PER_USEC; } /* * Convert cputime <-> clock (USER_HZ) */ -#define cputime_to_clock_t(__ct) ((__ct) / (NSEC_PER_SEC / USER_HZ)) -#define clock_t_to_cputime(__x) ((__x) * (NSEC_PER_SEC / USER_HZ)) +#define cputime_to_clock_t(__ct) \ + ((__force u64)(__ct) / (NSEC_PER_SEC / USER_HZ)) +#define clock_t_to_cputime(__x) \ + (__force cputime_t)((__x) * (NSEC_PER_SEC / USER_HZ)) /* * Convert cputime64 to clock. */ -#define cputime64_to_clock_t(__ct) cputime_to_clock_t((cputime_t)__ct) +#define cputime64_to_clock_t(__ct) \ + cputime_to_clock_t((__force cputime_t)__ct) #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ #endif /* __IA64_CPUTIME_H */ diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index f114a3b14c6..1516d1dc11f 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -16,6 +16,7 @@ */ #include <linux/bootmem.h> #include <linux/efi.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/nmi.h> #include <linux/swap.h> @@ -348,7 +349,7 @@ paging_init (void) printk("Virtual mem_map starts at 0x%p\n", mem_map); } #else /* !CONFIG_VIRTUAL_MEM_MAP */ - add_active_range(0, 0, max_low_pfn); + memblock_add_node(0, PFN_PHYS(max_low_pfn), 0); free_area_init_nodes(max_zone_pfns); #endif /* !CONFIG_VIRTUAL_MEM_MAP */ zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 00cb0e26c64..13df239dbed 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -10,6 +10,7 @@ #include <linux/bootmem.h> #include <linux/efi.h> #include <linux/elf.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> @@ -557,8 +558,7 @@ int __init register_active_ranges(u64 start, u64 len, int nid) #endif if (start < end) - add_active_range(nid, __pa(start) >> PAGE_SHIFT, - __pa(end) >> PAGE_SHIFT); + memblock_add_node(__pa(start), end - start, nid); return 0; } diff --git a/arch/m68k/platform/68328/timers.c b/arch/m68k/platform/68328/timers.c index 309f725995b..f2678866067 100644 --- a/arch/m68k/platform/68328/timers.c +++ b/arch/m68k/platform/68328/timers.c @@ -93,7 +93,6 @@ static struct clocksource m68328_clk = { .name = "timer", .rating = 250, .read = m68328_read_clk, - .shift = 20, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -115,8 +114,7 @@ void hw_timer_init(void) /* Enable timer 1 */ TCTL |= TCTL_TEN; - m68328_clk.mult = clocksource_hz2mult(TICKS_PER_JIFFY*HZ, m68328_clk.shift); - clocksource_register(&m68328_clk); + clocksource_register_hz(&m68328_clk, TICKS_PER_JIFFY*HZ); } /***************************************************************************/ diff --git a/arch/m68k/platform/coldfire/dma_timer.c b/arch/m68k/platform/coldfire/dma_timer.c index a5f562823d7..235ad57c470 100644 --- a/arch/m68k/platform/coldfire/dma_timer.c +++ b/arch/m68k/platform/coldfire/dma_timer.c @@ -44,7 +44,6 @@ static struct clocksource clocksource_cf_dt = { .rating = 200, .read = cf_dt_get_cycles, .mask = CLOCKSOURCE_MASK(32), - .shift = 20, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -60,9 +59,7 @@ static int __init init_cf_dt_clocksource(void) __raw_writeb(0x00, DTER0); __raw_writel(0x00000000, DTRR0); __raw_writew(DMA_DTMR_CLK_DIV_16 | DMA_DTMR_ENABLE, DTMR0); - clocksource_cf_dt.mult = clocksource_hz2mult(DMA_FREQ, - clocksource_cf_dt.shift); - return clocksource_register(&clocksource_cf_dt); + return clocksource_register_hz(&clocksource_cf_dt, DMA_FREQ); } arch_initcall(init_cf_dt_clocksource); diff --git a/arch/m68k/platform/coldfire/pit.c b/arch/m68k/platform/coldfire/pit.c index c2b980926be..02663d25822 100644 --- a/arch/m68k/platform/coldfire/pit.c +++ b/arch/m68k/platform/coldfire/pit.c @@ -144,7 +144,6 @@ static struct clocksource pit_clk = { .name = "pit", .rating = 100, .read = pit_read_clk, - .shift = 20, .mask = CLOCKSOURCE_MASK(32), }; @@ -162,8 +161,7 @@ void hw_timer_init(void) setup_irq(MCFINT_VECBASE + MCFINT_PIT1, &pit_irq); - pit_clk.mult = clocksource_hz2mult(FREQ, pit_clk.shift); - clocksource_register(&pit_clk); + clocksource_register_hz(&pit_clk, FREQ); } /***************************************************************************/ diff --git a/arch/m68k/platform/coldfire/sltimers.c b/arch/m68k/platform/coldfire/sltimers.c index 6a85daf9a7f..b7f822b552b 100644 --- a/arch/m68k/platform/coldfire/sltimers.c +++ b/arch/m68k/platform/coldfire/sltimers.c @@ -114,7 +114,6 @@ static struct clocksource mcfslt_clk = { .name = "slt", .rating = 250, .read = mcfslt_read_clk, - .shift = 20, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -136,8 +135,7 @@ void hw_timer_init(void) setup_irq(MCF_IRQ_TIMER, &mcfslt_timer_irq); - mcfslt_clk.mult = clocksource_hz2mult(MCF_BUSCLK, mcfslt_clk.shift); - clocksource_register(&mcfslt_clk); + clocksource_register_hz(&mcfslt_clk, MCF_BUSCLK); #ifdef CONFIG_HIGHPROFILE mcfslt_profile_init(); diff --git a/arch/m68k/platform/coldfire/timers.c b/arch/m68k/platform/coldfire/timers.c index 60242f65fea..0d90da32fcd 100644 --- a/arch/m68k/platform/coldfire/timers.c +++ b/arch/m68k/platform/coldfire/timers.c @@ -88,7 +88,6 @@ static struct clocksource mcftmr_clk = { .name = "tmr", .rating = 250, .read = mcftmr_read_clk, - .shift = 20, .mask = CLOCKSOURCE_MASK(32), .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -109,8 +108,7 @@ void hw_timer_init(void) __raw_writew(MCFTIMER_TMR_ENORI | MCFTIMER_TMR_CLK16 | MCFTIMER_TMR_RESTART | MCFTIMER_TMR_ENABLE, TA(MCFTIMER_TMR)); - mcftmr_clk.mult = clocksource_hz2mult(FREQ, mcftmr_clk.shift); - clocksource_register(&mcftmr_clk); + clocksource_register_hz(&mcftmr_clk, FREQ); setup_irq(MCF_IRQ_TIMER, &mcftmr_timer_irq); diff --git a/arch/microblaze/include/asm/memblock.h b/arch/microblaze/include/asm/memblock.h deleted file mode 100644 index 20a8e257c77..00000000000 --- a/arch/microblaze/include/asm/memblock.h +++ /dev/null @@ -1,14 +0,0 @@ -/* - * Copyright (C) 2008 Michal Simek <monstr@monstr.eu> - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#ifndef _ASM_MICROBLAZE_MEMBLOCK_H -#define _ASM_MICROBLAZE_MEMBLOCK_H - -#endif /* _ASM_MICROBLAZE_MEMBLOCK_H */ - - diff --git a/arch/microblaze/kernel/prom.c b/arch/microblaze/kernel/prom.c index 977484add21..80d314e8190 100644 --- a/arch/microblaze/kernel/prom.c +++ b/arch/microblaze/kernel/prom.c @@ -122,7 +122,6 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line); /* Scan memory nodes and rebuild MEMBLOCKs */ - memblock_init(); of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory, NULL); @@ -130,7 +129,7 @@ void __init early_init_devtree(void *params) strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE); parse_early_param(); - memblock_analyze(); + memblock_allow_resize(); pr_debug("Phys. mem: %lx\n", (unsigned long) memblock_phys_mem_size()); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d46f1da18a3..9c652eb68aa 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -25,6 +25,9 @@ config MIPS select GENERIC_IRQ_SHOW select HAVE_ARCH_JUMP_LABEL select IRQ_FORCED_THREADING + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select ARCH_DISCARD_MEMBLOCK menu "Machine selection" @@ -2064,9 +2067,6 @@ config ARCH_DISCONTIGMEM_ENABLE or have huge holes in the physical address space for other reasons. See <file:Documentation/vm/numa> for more. -config ARCH_POPULATES_NODE_MAP - def_bool y - config ARCH_SPARSEMEM_ENABLE bool select SPARSEMEM_STATIC diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 84af26ab221..b1cb8f87d7b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -14,6 +14,7 @@ #include <linux/ioport.h> #include <linux/export.h> #include <linux/screen_info.h> +#include <linux/memblock.h> #include <linux/bootmem.h> #include <linux/initrd.h> #include <linux/root_dev.h> @@ -352,7 +353,7 @@ static void __init bootmem_init(void) continue; #endif - add_active_range(0, start, end); + memblock_add_node(PFN_PHYS(start), PFN_PHYS(end - start), 0); } /* diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index bc1297109cc..b105eca3c02 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -12,6 +12,7 @@ */ #include <linux/init.h> #include <linux/kernel.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> @@ -381,8 +382,8 @@ static void __init szmem(void) continue; } num_physpages += slot_psize; - add_active_range(node, slot_getbasepfn(node, slot), - slot_getbasepfn(node, slot) + slot_psize); + memblock_add_node(PFN_PHYS(slot_getbasepfn(node, slot)), + PFN_PHYS(slot_psize), node); } } } diff --git a/arch/openrisc/include/asm/memblock.h b/arch/openrisc/include/asm/memblock.h deleted file mode 100644 index bbe5a1c788c..00000000000 --- a/arch/openrisc/include/asm/memblock.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * OpenRISC Linux - * - * Linux architectural port borrowing liberally from similar works of - * others. All original copyrights apply as per the original source - * declaration. - * - * OpenRISC implementation: - * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com> - * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se> - * et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __ASM_OPENRISC_MEMBLOCK_H -#define __ASM_OPENRISC_MEMBLOCK_H - -/* empty */ - -#endif /* __ASM_OPENRISC_MEMBLOCK_H */ diff --git a/arch/openrisc/kernel/prom.c b/arch/openrisc/kernel/prom.c index 1bb58ba89af..3d4478f6c94 100644 --- a/arch/openrisc/kernel/prom.c +++ b/arch/openrisc/kernel/prom.c @@ -76,14 +76,13 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen, cmd_line); /* Scan memory nodes and rebuild MEMBLOCKs */ - memblock_init(); of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory, NULL); /* Save command line for /proc/cmdline and then parse parameters */ strlcpy(boot_command_line, cmd_line, COMMAND_LINE_SIZE); - memblock_analyze(); + memblock_allow_resize(); /* We must copy the flattend device tree from init memory to regular * memory because the device tree references the strings in it diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 45b7389d77a..7c0774397b8 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -198,8 +198,6 @@ static struct clocksource clocksource_cr16 = { .rating = 300, .read = read_cr16, .mask = CLOCKSOURCE_MASK(BITS_PER_LONG), - .mult = 0, /* to be set */ - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -270,7 +268,5 @@ void __init time_init(void) /* register at clocksource framework */ current_cr16_khz = PAGE0->mem_10msec/10; /* kHz */ - clocksource_cr16.mult = clocksource_khz2mult(current_cr16_khz, - clocksource_cr16.shift); - clocksource_register(&clocksource_cr16); + clocksource_register_khz(&clocksource_cr16, current_cr16_khz); } diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 951e18f5335..ead0bc68439 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -117,6 +117,7 @@ config PPC select HAVE_KRETPROBES select HAVE_ARCH_TRACEHOOK select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select USE_GENERIC_SMP_HELPERS if SMP @@ -421,9 +422,6 @@ config ARCH_SPARSEMEM_DEFAULT def_bool y depends on (SMP && PPC_PSERIES) || PPC_PS3 -config ARCH_POPULATES_NODE_MAP - def_bool y - config SYS_SUPPORTS_HUGETLBFS bool diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 98b7c4b49c9..6ec1c380a4d 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -29,25 +29,8 @@ static inline void setup_cputime_one_jiffy(void) { } #include <asm/time.h> #include <asm/param.h> -typedef u64 cputime_t; -typedef u64 cputime64_t; - -#define cputime_zero ((cputime_t)0) -#define cputime_max ((~((cputime_t)0) >> 1) - 1) -#define cputime_add(__a, __b) ((__a) + (__b)) -#define cputime_sub(__a, __b) ((__a) - (__b)) -#define cputime_div(__a, __n) ((__a) / (__n)) -#define cputime_halve(__a) ((__a) >> 1) -#define cputime_eq(__a, __b) ((__a) == (__b)) -#define cputime_gt(__a, __b) ((__a) > (__b)) -#define cputime_ge(__a, __b) ((__a) >= (__b)) -#define cputime_lt(__a, __b) ((__a) < (__b)) -#define cputime_le(__a, __b) ((__a) <= (__b)) - -#define cputime64_zero ((cputime64_t)0) -#define cputime64_add(__a, __b) ((__a) + (__b)) -#define cputime64_sub(__a, __b) ((__a) - (__b)) -#define cputime_to_cputime64(__ct) (__ct) +typedef u64 __nocast cputime_t; +typedef u64 __nocast cputime64_t; #ifdef __KERNEL__ @@ -65,7 +48,7 @@ DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta); static inline unsigned long cputime_to_jiffies(const cputime_t ct) { - return mulhdu(ct, __cputime_jiffies_factor); + return mulhdu((__force u64) ct, __cputime_jiffies_factor); } /* Estimate the scaled cputime by scaling the real cputime based on @@ -74,14 +57,15 @@ static inline cputime_t cputime_to_scaled(const cputime_t ct) { if (cpu_has_feature(CPU_FTR_SPURR) && __get_cpu_var(cputime_last_delta)) - return ct * __get_cpu_var(cputime_scaled_last_delta) / - __get_cpu_var(cputime_last_delta); + return (__force u64) ct * + __get_cpu_var(cputime_scaled_last_delta) / + __get_cpu_var(cputime_last_delta); return ct; } static inline cputime_t jiffies_to_cputime(const unsigned long jif) { - cputime_t ct; + u64 ct; unsigned long sec; /* have to be a little careful about overflow */ @@ -93,7 +77,7 @@ static inline cputime_t jiffies_to_cputime(const unsigned long jif) } if (sec) ct += (cputime_t) sec * tb_ticks_per_sec; - return ct; + return (__force cputime_t) ct; } static inline void setup_cputime_one_jiffy(void) @@ -103,7 +87,7 @@ static inline void setup_cputime_one_jiffy(void) static inline cputime64_t jiffies64_to_cputime64(const u64 jif) { - cputime_t ct; + u64 ct; u64 sec; /* have to be a little careful about overflow */ @@ -114,13 +98,13 @@ static inline cputime64_t jiffies64_to_cputime64(const u64 jif) do_div(ct, HZ); } if (sec) - ct += (cputime_t) sec * tb_ticks_per_sec; - return ct; + ct += (u64) sec * tb_ticks_per_sec; + return (__force cputime64_t) ct; } static inline u64 cputime64_to_jiffies64(const cputime_t ct) { - return mulhdu(ct, __cputime_jiffies_factor); + return mulhdu((__force u64) ct, __cputime_jiffies_factor); } /* @@ -130,12 +114,12 @@ extern u64 __cputime_msec_factor; static inline unsigned long cputime_to_usecs(const cputime_t ct) { - return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC; + return mulhdu((__force u64) ct, __cputime_msec_factor) * USEC_PER_MSEC; } static inline cputime_t usecs_to_cputime(const unsigned long us) { - cputime_t ct; + u64 ct; unsigned long sec; /* have to be a little careful about overflow */ @@ -147,7 +131,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) } if (sec) ct += (cputime_t) sec * tb_ticks_per_sec; - return ct; + return (__force cputime_t) ct; } #define usecs_to_cputime64(us) usecs_to_cputime(us) @@ -159,12 +143,12 @@ extern u64 __cputime_sec_factor; static inline unsigned long cputime_to_secs(const cputime_t ct) { - return mulhdu(ct, __cputime_sec_factor); + return mulhdu((__force u64) ct, __cputime_sec_factor); } static inline cputime_t secs_to_cputime(const unsigned long sec) { - return (cputime_t) sec * tb_ticks_per_sec; + return (__force cputime_t)((u64) sec * tb_ticks_per_sec); } /* @@ -172,7 +156,7 @@ static inline cputime_t secs_to_cputime(const unsigned long sec) */ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p) { - u64 x = ct; + u64 x = (__force u64) ct; unsigned int frac; frac = do_div(x, tb_ticks_per_sec); @@ -184,11 +168,11 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *p) static inline cputime_t timespec_to_cputime(const struct timespec *p) { - cputime_t ct; + u64 ct; ct = (u64) p->tv_nsec * tb_ticks_per_sec; do_div(ct, 1000000000); - return ct + (u64) p->tv_sec * tb_ticks_per_sec; + return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec); } /* @@ -196,7 +180,7 @@ static inline cputime_t timespec_to_cputime(const struct timespec *p) */ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p) { - u64 x = ct; + u64 x = (__force u64) ct; unsigned int frac; frac = do_div(x, tb_ticks_per_sec); @@ -208,11 +192,11 @@ static inline void cputime_to_timeval(const cputime_t ct, struct timeval *p) static inline cputime_t timeval_to_cputime(const struct timeval *p) { - cputime_t ct; + u64 ct; ct = (u64) p->tv_usec * tb_ticks_per_sec; do_div(ct, 1000000); - return ct + (u64) p->tv_sec * tb_ticks_per_sec; + return (__force cputime_t)(ct + (u64) p->tv_sec * tb_ticks_per_sec); } /* @@ -222,12 +206,12 @@ extern u64 __cputime_clockt_factor; static inline unsigned long cputime_to_clock_t(const cputime_t ct) { - return mulhdu(ct, __cputime_clockt_factor); + return mulhdu((__force u64) ct, __cputime_clockt_factor); } static inline cputime_t clock_t_to_cputime(const unsigned long clk) { - cputime_t ct; + u64 ct; unsigned long sec; /* have to be a little careful about overflow */ @@ -238,8 +222,8 @@ static inline cputime_t clock_t_to_cputime(const unsigned long clk) do_div(ct, USER_HZ); } if (sec) - ct += (cputime_t) sec * tb_ticks_per_sec; - return ct; + ct += (u64) sec * tb_ticks_per_sec; + return (__force cputime_t) ct; } #define cputime64_to_clock_t(ct) cputime_to_clock_t((cputime_t)(ct)) diff --git a/arch/powerpc/include/asm/memblock.h b/arch/powerpc/include/asm/memblock.h deleted file mode 100644 index 43efc345065..00000000000 --- a/arch/powerpc/include/asm/memblock.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _ASM_POWERPC_MEMBLOCK_H -#define _ASM_POWERPC_MEMBLOCK_H - -#include <asm/udbg.h> - -#define MEMBLOCK_DBG(fmt...) udbg_printf(fmt) - -#endif /* _ASM_POWERPC_MEMBLOCK_H */ diff --git a/arch/powerpc/kernel/machine_kexec.c b/arch/powerpc/kernel/machine_kexec.c index 9ce1672afb5..a2158a395d9 100644 --- a/arch/powerpc/kernel/machine_kexec.c +++ b/arch/powerpc/kernel/machine_kexec.c @@ -107,9 +107,6 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - /* this is necessary because of memblock_phys_mem_size() */ - memblock_analyze(); - /* use common parsing */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index fa1235b0503..abe405dab34 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -733,8 +733,6 @@ void __init early_init_devtree(void *params) of_scan_flat_dt(early_init_dt_scan_chosen_ppc, cmd_line); /* Scan memory nodes and rebuild MEMBLOCKs */ - memblock_init(); - of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); @@ -756,20 +754,14 @@ void __init early_init_devtree(void *params) early_reserve_mem(); phyp_dump_reserve_mem(); - limit = memory_limit; - if (! limit) { - phys_addr_t memsize; - - /* Ensure that total memory size is page-aligned, because - * otherwise mark_bootmem() gets upset. */ - memblock_analyze(); - memsize = memblock_phys_mem_size(); - if ((memsize & PAGE_MASK) != memsize) - limit = memsize & PAGE_MASK; - } + /* + * Ensure that total memory size is page-aligned, because otherwise + * mark_bootmem() gets upset. + */ + limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE); memblock_enforce_memory_limit(limit); - memblock_analyze(); + memblock_allow_resize(); memblock_dump_all(); DBG("Phys. mem: %llx\n", memblock_phys_mem_size()); diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 161cefde5c1..58861fa1220 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -134,8 +134,7 @@ void __init MMU_init(void) if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII - memblock.memory.cnt = 1; - memblock_analyze(); + memblock_enforce_memory_limit(memblock.memory.regions[0].size); printk(KERN_WARNING "Only using first contiguous memory region"); #else wii_memory_fixups(); @@ -158,7 +157,6 @@ void __init MMU_init(void) #ifndef CONFIG_HIGHMEM total_memory = total_lowmem; memblock_enforce_memory_limit(total_lowmem); - memblock_analyze(); #endif /* CONFIG_HIGHMEM */ } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 2dd6bdd31fe..8e2eb6611b0 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -199,7 +199,7 @@ void __init do_init_bootmem(void) unsigned long start_pfn, end_pfn; start_pfn = memblock_region_memory_base_pfn(reg); end_pfn = memblock_region_memory_end_pfn(reg); - add_active_range(0, start_pfn, end_pfn); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); } /* Add all physical memory to the bootmem map, mark each area diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b22a83a91cb..e6eea0ac80c 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -127,45 +127,25 @@ static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, } /* - * get_active_region_work_fn - A helper function for get_node_active_region - * Returns datax set to the start_pfn and end_pfn if they contain - * the initial value of datax->start_pfn between them - * @start_pfn: start page(inclusive) of region to check - * @end_pfn: end page(exclusive) of region to check - * @datax: comes in with ->start_pfn set to value to search for and - * goes out with active range if it contains it - * Returns 1 if search value is in range else 0 - */ -static int __init get_active_region_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - struct node_active_region *data; - data = (struct node_active_region *)datax; - - if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) { - data->start_pfn = start_pfn; - data->end_pfn = end_pfn; - return 1; - } - return 0; - -} - -/* - * get_node_active_region - Return active region containing start_pfn + * get_node_active_region - Return active region containing pfn * Active range returned is empty if none found. - * @start_pfn: The page to return the region for. - * @node_ar: Returned set to the active region containing start_pfn + * @pfn: The page to return the region for + * @node_ar: Returned set to the active region containing @pfn */ -static void __init get_node_active_region(unsigned long start_pfn, - struct node_active_region *node_ar) +static void __init get_node_active_region(unsigned long pfn, + struct node_active_region *node_ar) { - int nid = early_pfn_to_nid(start_pfn); + unsigned long start_pfn, end_pfn; + int i, nid; - node_ar->nid = nid; - node_ar->start_pfn = start_pfn; - node_ar->end_pfn = start_pfn; - work_with_active_regions(nid, get_active_region_work_fn, node_ar); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + if (pfn >= start_pfn && pfn < end_pfn) { + node_ar->nid = nid; + node_ar->start_pfn = start_pfn; + node_ar->end_pfn = end_pfn; + break; + } + } } static void map_cpu_to_node(int cpu, int node) @@ -710,9 +690,7 @@ static void __init parse_drconf_memory(struct device_node *memory) node_set_online(nid); sz = numa_enforce_memory_limit(base, size); if (sz) - add_active_range(nid, base >> PAGE_SHIFT, - (base >> PAGE_SHIFT) - + (sz >> PAGE_SHIFT)); + memblock_set_node(base, sz, nid); } while (--ranges); } } @@ -802,8 +780,7 @@ new_range: continue; } - add_active_range(nid, start >> PAGE_SHIFT, - (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT)); + memblock_set_node(start, size, nid); if (--ranges) goto new_range; @@ -839,7 +816,8 @@ static void __init setup_nonnuma(void) end_pfn = memblock_region_memory_end_pfn(reg); fake_numa_create_new_node(end_pfn, &nid); - add_active_range(nid, start_pfn, end_pfn); + memblock_set_node(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), nid); node_set_online(nid); } } diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index 4e13d6f9023..573ba3b69d1 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -615,7 +615,6 @@ static void __early_init_mmu(int boot_cpu) /* limit memory so we dont have linear faults */ memblock_enforce_memory_limit(linear_map_top); - memblock_analyze(); patch_exception(0x1c0, exc_data_tlb_miss_bolted_book3e); patch_exception(0x1e0, exc_instruction_tlb_miss_bolted_book3e); diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index 1b5dc1a2e14..6d8dadf19f0 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -79,24 +79,19 @@ void __init wii_memory_fixups(void) BUG_ON(memblock.memory.cnt != 2); BUG_ON(!page_aligned(p[0].base) || !page_aligned(p[1].base)); - p[0].size = _ALIGN_DOWN(p[0].size, PAGE_SIZE); - p[1].size = _ALIGN_DOWN(p[1].size, PAGE_SIZE); + /* trim unaligned tail */ + memblock_remove(ALIGN(p[1].base + p[1].size, PAGE_SIZE), + (phys_addr_t)ULLONG_MAX); - wii_hole_start = p[0].base + p[0].size; + /* determine hole, add & reserve them */ + wii_hole_start = ALIGN(p[0].base + p[0].size, PAGE_SIZE); wii_hole_size = p[1].base - wii_hole_start; - - pr_info("MEM1: <%08llx %08llx>\n", p[0].base, p[0].size); - pr_info("HOLE: <%08lx %08lx>\n", wii_hole_start, wii_hole_size); - pr_info("MEM2: <%08llx %08llx>\n", p[1].base, p[1].size); - - p[0].size += wii_hole_size + p[1].size; - - memblock.memory.cnt = 1; - memblock_analyze(); - - /* reserve the hole */ + memblock_add(wii_hole_start, wii_hole_size); memblock_reserve(wii_hole_start, wii_hole_size); + BUG_ON(memblock.memory.cnt != 1); + __memblock_dump_all(); + /* allow ioremapping the address space in the hole */ __allow_ioremap_reserved = 1; } diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index 72714ad2784..8bd6ba54269 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -319,7 +319,6 @@ static int __init ps3_mm_add_memory(void) } memblock_add(start_addr, map.r1.size); - memblock_analyze(); result = online_pages(start_pfn, nr_pages); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 373679b3744..d48ede33443 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -92,6 +92,9 @@ config S390 select HAVE_ARCH_JUMP_LABEL if !MARCH_G5 select HAVE_RCU_TABLE_FREE if SMP select ARCH_SAVE_PAGE_KEYS if HIBERNATION + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select ARCH_DISCARD_MEMBLOCK select ARCH_INLINE_SPIN_TRYLOCK select ARCH_INLINE_SPIN_TRYLOCK_BH select ARCH_INLINE_SPIN_LOCK @@ -345,9 +348,6 @@ config WARN_DYNAMIC_STACK Say N if you are unsure. -config ARCH_POPULATES_NODE_MAP - def_bool y - comment "Kernel preemption" source "kernel/Kconfig.preempt" diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c index 92f1cb745d6..4de031d6b76 100644 --- a/arch/s390/appldata/appldata_os.c +++ b/arch/s390/appldata/appldata_os.c @@ -115,21 +115,21 @@ static void appldata_get_os_data(void *data) j = 0; for_each_online_cpu(i) { os_data->os_cpu[j].per_cpu_user = - cputime_to_jiffies(kstat_cpu(i).cpustat.user); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_USER]); os_data->os_cpu[j].per_cpu_nice = - cputime_to_jiffies(kstat_cpu(i).cpustat.nice); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_NICE]); os_data->os_cpu[j].per_cpu_system = - cputime_to_jiffies(kstat_cpu(i).cpustat.system); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]); os_data->os_cpu[j].per_cpu_idle = - cputime_to_jiffies(kstat_cpu(i).cpustat.idle); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IDLE]); os_data->os_cpu[j].per_cpu_irq = - cputime_to_jiffies(kstat_cpu(i).cpustat.irq); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IRQ]); os_data->os_cpu[j].per_cpu_softirq = - cputime_to_jiffies(kstat_cpu(i).cpustat.softirq); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]); os_data->os_cpu[j].per_cpu_iowait = - cputime_to_jiffies(kstat_cpu(i).cpustat.iowait); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_IOWAIT]); os_data->os_cpu[j].per_cpu_steal = - cputime_to_jiffies(kstat_cpu(i).cpustat.steal); + cputime_to_jiffies(kcpustat_cpu(i).cpustat[CPUTIME_STEAL]); os_data->os_cpu[j].cpu_id = i; j++; } diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index b9acaaa175d..c23c3900c30 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -16,75 +16,60 @@ /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ -typedef unsigned long long cputime_t; -typedef unsigned long long cputime64_t; +typedef unsigned long long __nocast cputime_t; +typedef unsigned long long __nocast cputime64_t; -#ifndef __s390x__ - -static inline unsigned int -__div(unsigned long long n, unsigned int base) +static inline unsigned long __div(unsigned long long n, unsigned long base) { +#ifndef __s390x__ register_pair rp; rp.pair = n >> 1; asm ("dr %0,%1" : "+d" (rp) : "d" (base >> 1)); return rp.subreg.odd; +#else /* __s390x__ */ + return n / base; +#endif /* __s390x__ */ } -#else /* __s390x__ */ +#define cputime_one_jiffy jiffies_to_cputime(1) -static inline unsigned int -__div(unsigned long long n, unsigned int base) +/* + * Convert cputime to jiffies and back. + */ +static inline unsigned long cputime_to_jiffies(const cputime_t cputime) { - return n / base; + return __div((__force unsigned long long) cputime, 4096000000ULL / HZ); } -#endif /* __s390x__ */ +static inline cputime_t jiffies_to_cputime(const unsigned int jif) +{ + return (__force cputime_t)(jif * (4096000000ULL / HZ)); +} -#define cputime_zero (0ULL) -#define cputime_one_jiffy jiffies_to_cputime(1) -#define cputime_max ((~0UL >> 1) - 1) -#define cputime_add(__a, __b) ((__a) + (__b)) -#define cputime_sub(__a, __b) ((__a) - (__b)) -#define cputime_div(__a, __n) ({ \ - unsigned long long __div = (__a); \ - do_div(__div,__n); \ - __div; \ -}) -#define cputime_halve(__a) ((__a) >> 1) -#define cputime_eq(__a, __b) ((__a) == (__b)) -#define cputime_gt(__a, __b) ((__a) > (__b)) -#define cputime_ge(__a, __b) ((__a) >= (__b)) -#define cputime_lt(__a, __b) ((__a) < (__b)) -#define cputime_le(__a, __b) ((__a) <= (__b)) -#define cputime_to_jiffies(__ct) (__div((__ct), 4096000000ULL / HZ)) -#define cputime_to_scaled(__ct) (__ct) -#define jiffies_to_cputime(__hz) ((cputime_t)(__hz) * (4096000000ULL / HZ)) - -#define cputime64_zero (0ULL) -#define cputime64_add(__a, __b) ((__a) + (__b)) -#define cputime_to_cputime64(__ct) (__ct) - -static inline u64 -cputime64_to_jiffies64(cputime64_t cputime) -{ - do_div(cputime, 4096000000ULL / HZ); - return cputime; +static inline u64 cputime64_to_jiffies64(cputime64_t cputime) +{ + unsigned long long jif = (__force unsigned long long) cputime; + do_div(jif, 4096000000ULL / HZ); + return jif; +} + +static inline cputime64_t jiffies64_to_cputime64(const u64 jif) +{ + return (__force cputime64_t)(jif * (4096000000ULL / HZ)); } /* * Convert cputime to microseconds and back. */ -static inline unsigned int -cputime_to_usecs(const cputime_t cputime) +static inline unsigned int cputime_to_usecs(const cputime_t cputime) { - return cputime_div(cputime, 4096); + return (__force unsigned long long) cputime >> 12; } -static inline cputime_t -usecs_to_cputime(const unsigned int m) +static inline cputime_t usecs_to_cputime(const unsigned int m) { - return (cputime_t) m * 4096; + return (__force cputime_t)(m * 4096ULL); } #define usecs_to_cputime64(m) usecs_to_cputime(m) @@ -92,40 +77,39 @@ usecs_to_cputime(const unsigned int m) /* * Convert cputime to milliseconds and back. */ -static inline unsigned int -cputime_to_secs(const cputime_t cputime) +static inline unsigned int cputime_to_secs(const cputime_t cputime) { - return __div(cputime, 2048000000) >> 1; + return __div((__force unsigned long long) cputime, 2048000000) >> 1; } -static inline cputime_t -secs_to_cputime(const unsigned int s) +static inline cputime_t secs_to_cputime(const unsigned int s) { - return (cputime_t) s * 4096000000ULL; + return (__force cputime_t)(s * 4096000000ULL); } /* * Convert cputime to timespec and back. */ -static inline cputime_t -timespec_to_cputime(const struct timespec *value) +static inline cputime_t timespec_to_cputime(const struct timespec *value) { - return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL; + unsigned long long ret = value->tv_sec * 4096000000ULL; + return (__force cputime_t)(ret + value->tv_nsec * 4096 / 1000); } -static inline void -cputime_to_timespec(const cputime_t cputime, struct timespec *value) +static inline void cputime_to_timespec(const cputime_t cputime, + struct timespec *value) { + unsigned long long __cputime = (__force unsigned long long) cputime; #ifndef __s390x__ register_pair rp; - rp.pair = cputime >> 1; + rp.pair = __cputime >> 1; asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); value->tv_nsec = rp.subreg.even * 1000 / 4096; value->tv_sec = rp.subreg.odd; #else - value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096; - value->tv_sec = cputime / 4096000000ULL; + value->tv_nsec = (__cputime % 4096000000ULL) * 1000 / 4096; + value->tv_sec = __cputime / 4096000000ULL; #endif } @@ -134,50 +118,52 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value) * Since cputime and timeval have the same resolution (microseconds) * this is easy. */ -static inline cputime_t -timeval_to_cputime(const struct timeval *value) +static inline cputime_t timeval_to_cputime(const struct timeval *value) { - return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL; + unsigned long long ret = value->tv_sec * 4096000000ULL; + return (__force cputime_t)(ret + value->tv_usec * 4096ULL); } -static inline void -cputime_to_timeval(const cputime_t cputime, struct timeval *value) +static inline void cputime_to_timeval(const cputime_t cputime, + struct timeval *value) { + unsigned long long __cputime = (__force unsigned long long) cputime; #ifndef __s390x__ register_pair rp; - rp.pair = cputime >> 1; + rp.pair = __cputime >> 1; asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); value->tv_usec = rp.subreg.even / 4096; value->tv_sec = rp.subreg.odd; #else - value->tv_usec = (cputime % 4096000000ULL) / 4096; - value->tv_sec = cputime / 4096000000ULL; + value->tv_usec = (__cputime % 4096000000ULL) / 4096; + value->tv_sec = __cputime / 4096000000ULL; #endif } /* * Convert cputime to clock and back. */ -static inline clock_t -cputime_to_clock_t(cputime_t cputime) +static inline clock_t cputime_to_clock_t(cputime_t cputime) { - return cputime_div(cputime, 4096000000ULL / USER_HZ); + unsigned long long clock = (__force unsigned long long) cputime; + do_div(clock, 4096000000ULL / USER_HZ); + return clock; } -static inline cputime_t -clock_t_to_cputime(unsigned long x) +static inline cputime_t clock_t_to_cputime(unsigned long x) { - return (cputime_t) x * (4096000000ULL / USER_HZ); + return (__force cputime_t)(x * (4096000000ULL / USER_HZ)); } /* * Convert cputime64 to clock. */ -static inline clock_t -cputime64_to_clock_t(cputime64_t cputime) +static inline clock_t cputime64_to_clock_t(cputime64_t cputime) { - return cputime_div(cputime, 4096000000ULL / USER_HZ); + unsigned long long clock = (__force unsigned long long) cputime; + do_div(clock, 4096000000ULL / USER_HZ); + return clock; } struct s390_idle_data { diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index e54c4ff8aba..f11d1b037c5 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/sched.h> #include <linux/kernel.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/stddef.h> #include <linux/unistd.h> @@ -820,7 +821,8 @@ setup_memory(void) end_chunk = min(end_chunk, end_pfn); if (start_chunk >= end_chunk) continue; - add_active_range(0, start_chunk, end_chunk); + memblock_add_node(PFN_PHYS(start_chunk), + PFN_PHYS(end_chunk - start_chunk), 0); pfn = max(start_chunk, start_pfn); for (; pfn < end_chunk; pfn++) page_set_storage_key(PFN_PHYS(pfn), diff --git a/arch/s390/oprofile/hwsampler.c b/arch/s390/oprofile/hwsampler.c index f43c0e4282a..9daee91e6c3 100644 --- a/arch/s390/oprofile/hwsampler.c +++ b/arch/s390/oprofile/hwsampler.c @@ -22,6 +22,7 @@ #include <asm/irq.h> #include "hwsampler.h" +#include "op_counter.h" #define MAX_NUM_SDB 511 #define MIN_NUM_SDB 1 @@ -896,6 +897,8 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, if (sample_data_ptr->P == 1) { /* userspace sample */ unsigned int pid = sample_data_ptr->prim_asn; + if (!counter_config.user) + goto skip_sample; rcu_read_lock(); tsk = pid_task(find_vpid(pid), PIDTYPE_PID); if (tsk) @@ -903,6 +906,8 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, rcu_read_unlock(); } else { /* kernelspace sample */ + if (!counter_config.kernel) + goto skip_sample; regs = task_pt_regs(current); } @@ -910,7 +915,7 @@ static void add_samples_to_oprofile(unsigned int cpu, unsigned long *sdbt, oprofile_add_ext_hw_sample(sample_data_ptr->ia, regs, 0, !sample_data_ptr->P, tsk); mutex_unlock(&hws_sem); - + skip_sample: sample_data_ptr++; } } diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index bd58b72454c..2297be406c6 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -2,10 +2,11 @@ * arch/s390/oprofile/init.c * * S390 Version - * Copyright (C) 2003 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Copyright (C) 2002-2011 IBM Deutschland Entwicklung GmbH, IBM Corporation * Author(s): Thomas Spatzier (tspat@de.ibm.com) * Author(s): Mahesh Salgaonkar (mahesh@linux.vnet.ibm.com) * Author(s): Heinz Graalfs (graalfs@linux.vnet.ibm.com) + * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) * * @remark Copyright 2002-2011 OProfile authors */ @@ -14,6 +15,8 @@ #include <linux/init.h> #include <linux/errno.h> #include <linux/fs.h> +#include <linux/module.h> +#include <asm/processor.h> #include "../../../drivers/oprofile/oprof.h" @@ -22,6 +25,7 @@ extern void s390_backtrace(struct pt_regs * const regs, unsigned int depth); #ifdef CONFIG_64BIT #include "hwsampler.h" +#include "op_counter.h" #define DEFAULT_INTERVAL 4127518 @@ -35,16 +39,41 @@ static unsigned long oprofile_max_interval; static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS; static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS; -static int hwsampler_file; +static int hwsampler_enabled; static int hwsampler_running; /* start_mutex must be held to change */ +static int hwsampler_available; static struct oprofile_operations timer_ops; +struct op_counter_config counter_config; + +enum __force_cpu_type { + reserved = 0, /* do not force */ + timer, +}; +static int force_cpu_type; + +static int set_cpu_type(const char *str, struct kernel_param *kp) +{ + if (!strcmp(str, "timer")) { + force_cpu_type = timer; + printk(KERN_INFO "oprofile: forcing timer to be returned " + "as cpu type\n"); + } else { + force_cpu_type = 0; + } + + return 0; +} +module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); +MODULE_PARM_DESC(cpu_type, "Force legacy basic mode sampling" + "(report cpu_type \"timer\""); + static int oprofile_hwsampler_start(void) { int retval; - hwsampler_running = hwsampler_file; + hwsampler_running = hwsampler_enabled; if (!hwsampler_running) return timer_ops.start(); @@ -72,10 +101,16 @@ static void oprofile_hwsampler_stop(void) return; } +/* + * File ops used for: + * /dev/oprofile/0/enabled + * /dev/oprofile/hwsampling/hwsampler (cpu_type = timer) + */ + static ssize_t hwsampler_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { - return oprofilefs_ulong_to_user(hwsampler_file, buf, count, offset); + return oprofilefs_ulong_to_user(hwsampler_enabled, buf, count, offset); } static ssize_t hwsampler_write(struct file *file, char const __user *buf, @@ -91,6 +126,9 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf, if (retval <= 0) return retval; + if (val != 0 && val != 1) + return -EINVAL; + if (oprofile_started) /* * save to do without locking as we set @@ -99,7 +137,7 @@ static ssize_t hwsampler_write(struct file *file, char const __user *buf, */ return -EBUSY; - hwsampler_file = val; + hwsampler_enabled = val; return count; } @@ -109,38 +147,311 @@ static const struct file_operations hwsampler_fops = { .write = hwsampler_write, }; +/* + * File ops used for: + * /dev/oprofile/0/count + * /dev/oprofile/hwsampling/hw_interval (cpu_type = timer) + * + * Make sure that the value is within the hardware range. + */ + +static ssize_t hw_interval_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(oprofile_hw_interval, buf, + count, offset); +} + +static ssize_t hw_interval_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + if (val < oprofile_min_interval) + oprofile_hw_interval = oprofile_min_interval; + else if (val > oprofile_max_interval) + oprofile_hw_interval = oprofile_max_interval; + else + oprofile_hw_interval = val; + + return count; +} + +static const struct file_operations hw_interval_fops = { + .read = hw_interval_read, + .write = hw_interval_write, +}; + +/* + * File ops used for: + * /dev/oprofile/0/event + * Only a single event with number 0 is supported with this counter. + * + * /dev/oprofile/0/unit_mask + * This is a dummy file needed by the user space tools. + * No value other than 0 is accepted or returned. + */ + +static ssize_t hwsampler_zero_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(0, buf, count, offset); +} + +static ssize_t hwsampler_zero_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + if (val != 0) + return -EINVAL; + return count; +} + +static const struct file_operations zero_fops = { + .read = hwsampler_zero_read, + .write = hwsampler_zero_write, +}; + +/* /dev/oprofile/0/kernel file ops. */ + +static ssize_t hwsampler_kernel_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(counter_config.kernel, + buf, count, offset); +} + +static ssize_t hwsampler_kernel_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + if (val != 0 && val != 1) + return -EINVAL; + + counter_config.kernel = val; + + return count; +} + +static const struct file_operations kernel_fops = { + .read = hwsampler_kernel_read, + .write = hwsampler_kernel_write, +}; + +/* /dev/oprofile/0/user file ops. */ + +static ssize_t hwsampler_user_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(counter_config.user, + buf, count, offset); +} + +static ssize_t hwsampler_user_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + if (val != 0 && val != 1) + return -EINVAL; + + counter_config.user = val; + + return count; +} + +static const struct file_operations user_fops = { + .read = hwsampler_user_read, + .write = hwsampler_user_write, +}; + + +/* + * File ops used for: /dev/oprofile/timer/enabled + * The value always has to be the inverted value of hwsampler_enabled. So + * no separate variable is created. That way we do not need locking. + */ + +static ssize_t timer_enabled_read(struct file *file, char __user *buf, + size_t count, loff_t *offset) +{ + return oprofilefs_ulong_to_user(!hwsampler_enabled, buf, count, offset); +} + +static ssize_t timer_enabled_write(struct file *file, char const __user *buf, + size_t count, loff_t *offset) +{ + unsigned long val; + int retval; + + if (*offset) + return -EINVAL; + + retval = oprofilefs_ulong_from_user(&val, buf, count); + if (retval) + return retval; + + if (val != 0 && val != 1) + return -EINVAL; + + /* Timer cannot be disabled without having hardware sampling. */ + if (val == 0 && !hwsampler_available) + return -EINVAL; + + if (oprofile_started) + /* + * save to do without locking as we set + * hwsampler_running in start() when start_mutex is + * held + */ + return -EBUSY; + + hwsampler_enabled = !val; + + return count; +} + +static const struct file_operations timer_enabled_fops = { + .read = timer_enabled_read, + .write = timer_enabled_write, +}; + + static int oprofile_create_hwsampling_files(struct super_block *sb, - struct dentry *root) + struct dentry *root) { - struct dentry *hw_dir; + struct dentry *dir; + + dir = oprofilefs_mkdir(sb, root, "timer"); + if (!dir) + return -EINVAL; + + oprofilefs_create_file(sb, dir, "enabled", &timer_enabled_fops); + + if (!hwsampler_available) + return 0; /* reinitialize default values */ - hwsampler_file = 1; + hwsampler_enabled = 1; + counter_config.kernel = 1; + counter_config.user = 1; - hw_dir = oprofilefs_mkdir(sb, root, "hwsampling"); - if (!hw_dir) - return -EINVAL; + if (!force_cpu_type) { + /* + * Create the counter file system. A single virtual + * counter is created which can be used to + * enable/disable hardware sampling dynamically from + * user space. The user space will configure a single + * counter with a single event. The value of 'event' + * and 'unit_mask' are not evaluated by the kernel code + * and can only be set to 0. + */ + + dir = oprofilefs_mkdir(sb, root, "0"); + if (!dir) + return -EINVAL; - oprofilefs_create_file(sb, hw_dir, "hwsampler", &hwsampler_fops); - oprofilefs_create_ulong(sb, hw_dir, "hw_interval", - &oprofile_hw_interval); - oprofilefs_create_ro_ulong(sb, hw_dir, "hw_min_interval", - &oprofile_min_interval); - oprofilefs_create_ro_ulong(sb, hw_dir, "hw_max_interval", - &oprofile_max_interval); - oprofilefs_create_ulong(sb, hw_dir, "hw_sdbt_blocks", - &oprofile_sdbt_blocks); + oprofilefs_create_file(sb, dir, "enabled", &hwsampler_fops); + oprofilefs_create_file(sb, dir, "event", &zero_fops); + oprofilefs_create_file(sb, dir, "count", &hw_interval_fops); + oprofilefs_create_file(sb, dir, "unit_mask", &zero_fops); + oprofilefs_create_file(sb, dir, "kernel", &kernel_fops); + oprofilefs_create_file(sb, dir, "user", &user_fops); + oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks", + &oprofile_sdbt_blocks); + } else { + /* + * Hardware sampling can be used but the cpu_type is + * forced to timer in order to deal with legacy user + * space tools. The /dev/oprofile/hwsampling fs is + * provided in that case. + */ + dir = oprofilefs_mkdir(sb, root, "hwsampling"); + if (!dir) + return -EINVAL; + + oprofilefs_create_file(sb, dir, "hwsampler", + &hwsampler_fops); + oprofilefs_create_file(sb, dir, "hw_interval", + &hw_interval_fops); + oprofilefs_create_ro_ulong(sb, dir, "hw_min_interval", + &oprofile_min_interval); + oprofilefs_create_ro_ulong(sb, dir, "hw_max_interval", + &oprofile_max_interval); + oprofilefs_create_ulong(sb, dir, "hw_sdbt_blocks", + &oprofile_sdbt_blocks); + } return 0; } static int oprofile_hwsampler_init(struct oprofile_operations *ops) { + /* + * Initialize the timer mode infrastructure as well in order + * to be able to switch back dynamically. oprofile_timer_init + * is not supposed to fail. + */ + if (oprofile_timer_init(ops)) + BUG(); + + memcpy(&timer_ops, ops, sizeof(timer_ops)); + ops->create_files = oprofile_create_hwsampling_files; + + /* + * If the user space tools do not support newer cpu types, + * the force_cpu_type module parameter + * can be used to always return \"timer\" as cpu type. + */ + if (force_cpu_type != timer) { + struct cpuid id; + + get_cpu_id (&id); + + switch (id.machine) { + case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break; + case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break; + default: return -ENODEV; + } + } + if (hwsampler_setup()) return -ENODEV; /* - * create hwsampler files only if hwsampler_setup() succeeds. + * Query the range for the sampling interval from the + * hardware. */ oprofile_min_interval = hwsampler_query_min_interval(); if (oprofile_min_interval == 0) @@ -155,23 +466,17 @@ static int oprofile_hwsampler_init(struct oprofile_operations *ops) if (oprofile_hw_interval > oprofile_max_interval) oprofile_hw_interval = oprofile_max_interval; - if (oprofile_timer_init(ops)) - return -ENODEV; - - printk(KERN_INFO "oprofile: using hardware sampling\n"); - - memcpy(&timer_ops, ops, sizeof(timer_ops)); + printk(KERN_INFO "oprofile: System z hardware sampling " + "facility found.\n"); ops->start = oprofile_hwsampler_start; ops->stop = oprofile_hwsampler_stop; - ops->create_files = oprofile_create_hwsampling_files; return 0; } static void oprofile_hwsampler_exit(void) { - oprofile_timer_exit(); hwsampler_shutdown(); } @@ -182,7 +487,15 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) ops->backtrace = s390_backtrace; #ifdef CONFIG_64BIT - return oprofile_hwsampler_init(ops); + + /* + * -ENODEV is not reported to the caller. The module itself + * will use the timer mode sampling as fallback and this is + * always available. + */ + hwsampler_available = oprofile_hwsampler_init(ops) == 0; + + return 0; #else return -ENODEV; #endif diff --git a/arch/s390/oprofile/op_counter.h b/arch/s390/oprofile/op_counter.h new file mode 100644 index 00000000000..1a8d3ca0901 --- /dev/null +++ b/arch/s390/oprofile/op_counter.h @@ -0,0 +1,23 @@ +/** + * arch/s390/oprofile/op_counter.h + * + * Copyright (C) 2011 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Author(s): Andreas Krebbel (krebbel@linux.vnet.ibm.com) + * + * @remark Copyright 2011 OProfile authors + */ + +#ifndef OP_COUNTER_H +#define OP_COUNTER_H + +struct op_counter_config { + /* `enabled' maps to the hwsampler_file variable. */ + /* `count' maps to the oprofile_hw_interval variable. */ + /* `event' and `unit_mask' are unused. */ + unsigned long kernel; + unsigned long user; +}; + +extern struct op_counter_config counter_config; + +#endif /* OP_COUNTER_H */ diff --git a/arch/score/Kconfig b/arch/score/Kconfig index df169e84db4..8b0c9464aa9 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -4,6 +4,9 @@ config SCORE def_bool y select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW + select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select ARCH_DISCARD_MEMBLOCK choice prompt "System type" @@ -60,9 +63,6 @@ config 32BIT config ARCH_FLATMEM_ENABLE def_bool y -config ARCH_POPULATES_NODE_MAP - def_bool y - source "mm/Kconfig" config MEMORY_START diff --git a/arch/score/kernel/setup.c b/arch/score/kernel/setup.c index 6f898c05787..b48459afefd 100644 --- a/arch/score/kernel/setup.c +++ b/arch/score/kernel/setup.c @@ -26,6 +26,7 @@ #include <linux/bootmem.h> #include <linux/initrd.h> #include <linux/ioport.h> +#include <linux/memblock.h> #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/screen_info.h> @@ -54,7 +55,8 @@ static void __init bootmem_init(void) /* Initialize the boot-time allocator with low memory only. */ bootmap_size = init_bootmem_node(NODE_DATA(0), start_pfn, min_low_pfn, max_low_pfn); - add_active_range(0, min_low_pfn, max_low_pfn); + memblock_add_node(PFN_PHYS(min_low_pfn), + PFN_PHYS(max_low_pfn - min_low_pfn), 0); free_bootmem(PFN_PHYS(start_pfn), (max_low_pfn - start_pfn) << PAGE_SHIFT); diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5629e209913..47a2f1c2cb0 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -4,6 +4,7 @@ config SUPERH select CLKDEV_LOOKUP select HAVE_IDE if HAS_IOPORT select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select HAVE_GENERIC_DMA_COHERENT select HAVE_ARCH_TRACEHOOK diff --git a/arch/sh/include/asm/memblock.h b/arch/sh/include/asm/memblock.h deleted file mode 100644 index e87063fad2e..00000000000 --- a/arch/sh/include/asm/memblock.h +++ /dev/null @@ -1,4 +0,0 @@ -#ifndef __ASM_SH_MEMBLOCK_H -#define __ASM_SH_MEMBLOCK_H - -#endif /* __ASM_SH_MEMBLOCK_H */ diff --git a/arch/sh/kernel/machine_kexec.c b/arch/sh/kernel/machine_kexec.c index c5a33f007f8..9fea49f6e66 100644 --- a/arch/sh/kernel/machine_kexec.c +++ b/arch/sh/kernel/machine_kexec.c @@ -157,9 +157,6 @@ void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - /* this is necessary because of memblock_phys_mem_size() */ - memblock_analyze(); - ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &crash_size, &crash_base); if (ret == 0 && crash_size > 0) { diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 1a0e946679a..7b57bf1dc85 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -230,7 +230,8 @@ void __init __add_active_range(unsigned int nid, unsigned long start_pfn, pmb_bolt_mapping((unsigned long)__va(start), start, end - start, PAGE_KERNEL); - add_active_range(nid, start_pfn, end_pfn); + memblock_set_node(PFN_PHYS(start_pfn), + PFN_PHYS(end_pfn - start_pfn), nid); } void __init __weak plat_early_device_setup(void) diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index c3e61b36649..cb8f9920f4d 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -143,9 +143,6 @@ config MAX_ACTIVE_REGIONS CPU_SUBTYPE_SH7785) default "1" -config ARCH_POPULATES_NODE_MAP - def_bool y - config ARCH_SELECT_MEMORY_MODEL def_bool y diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 939ca0f356f..82cc576fab1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -324,7 +324,6 @@ void __init paging_init(void) unsigned long vaddr, end; int nid; - memblock_init(); sh_mv.mv_mem_init(); early_reserve_mem(); @@ -337,7 +336,7 @@ void __init paging_init(void) sh_mv.mv_mem_reserve(); memblock_enforce_memory_limit(memory_limit); - memblock_analyze(); + memblock_allow_resize(); memblock_dump_all(); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index f92602e8660..70ae9d81870 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -43,6 +43,7 @@ config SPARC64 select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP select HAVE_SYSCALL_WRAPPERS select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD @@ -352,9 +353,6 @@ config NODES_SPAN_OTHER_NODES def_bool y depends on NEED_MULTIPLE_NODES -config ARCH_POPULATES_NODE_MAP - def_bool y if SPARC64 - config ARCH_SELECT_MEMORY_MODEL def_bool y if SPARC64 diff --git a/arch/sparc/include/asm/memblock.h b/arch/sparc/include/asm/memblock.h deleted file mode 100644 index c67b047ef85..00000000000 --- a/arch/sparc/include/asm/memblock.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _SPARC64_MEMBLOCK_H -#define _SPARC64_MEMBLOCK_H - -#include <asm/oplib.h> - -#define MEMBLOCK_DBG(fmt...) prom_printf(fmt) - -#endif /* !(_SPARC64_MEMBLOCK_H) */ diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 8e073d80213..b3f5e7dfea5 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -790,7 +790,7 @@ static int find_node(unsigned long addr) return -1; } -u64 memblock_nid_range(u64 start, u64 end, int *nid) +static u64 memblock_nid_range(u64 start, u64 end, int *nid) { *nid = find_node(start); start += PAGE_SIZE; @@ -808,7 +808,7 @@ u64 memblock_nid_range(u64 start, u64 end, int *nid) return start; } #else -u64 memblock_nid_range(u64 start, u64 end, int *nid) +static u64 memblock_nid_range(u64 start, u64 end, int *nid) { *nid = 0; return end; @@ -816,7 +816,7 @@ u64 memblock_nid_range(u64 start, u64 end, int *nid) #endif /* This must be invoked after performing all of the necessary - * add_active_range() calls for 'nid'. We need to be able to get + * memblock_set_node() calls for 'nid'. We need to be able to get * correct data from get_pfn_range_for_nid(). */ static void __init allocate_node_data(int nid) @@ -987,14 +987,11 @@ static void __init add_node_ranges(void) this_end = memblock_nid_range(start, end, &nid); - numadbg("Adding active range nid[%d] " + numadbg("Setting memblock NUMA node nid[%d] " "start[%lx] end[%lx]\n", nid, start, this_end); - add_active_range(nid, - start >> PAGE_SHIFT, - this_end >> PAGE_SHIFT); - + memblock_set_node(start, this_end - start, nid); start = this_end; } } @@ -1282,7 +1279,6 @@ static void __init bootmem_init_nonnuma(void) { unsigned long top_of_ram = memblock_end_of_DRAM(); unsigned long total_ram = memblock_phys_mem_size(); - struct memblock_region *reg; numadbg("bootmem_init_nonnuma()\n"); @@ -1292,20 +1288,8 @@ static void __init bootmem_init_nonnuma(void) (top_of_ram - total_ram) >> 20); init_node_masks_nonnuma(); - - for_each_memblock(memory, reg) { - unsigned long start_pfn, end_pfn; - - if (!reg->size) - continue; - - start_pfn = memblock_region_memory_base_pfn(reg); - end_pfn = memblock_region_memory_end_pfn(reg); - add_active_range(0, start_pfn, end_pfn); - } - + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); allocate_node_data(0); - node_set_online(0); } @@ -1769,8 +1753,6 @@ void __init paging_init(void) sun4v_ktsb_init(); } - memblock_init(); - /* Find available physical memory... * * Read it twice in order to work around a bug in openfirmware. @@ -1796,7 +1778,7 @@ void __init paging_init(void) memblock_enforce_memory_limit(cmdline_memory_size); - memblock_analyze(); + memblock_allow_resize(); memblock_dump_all(); set_bit(0, mmu_context_bmap); diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c index a08d9fab81f..82a6e22f1f3 100644 --- a/arch/um/kernel/time.c +++ b/arch/um/kernel/time.c @@ -75,8 +75,6 @@ static struct clocksource itimer_clocksource = { .rating = 300, .read = itimer_read, .mask = CLOCKSOURCE_MASK(64), - .mult = 1000, - .shift = 0, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -94,9 +92,9 @@ static void __init setup_itimer(void) clockevent_delta2ns(60 * HZ, &itimer_clockevent); itimer_clockevent.min_delta_ns = clockevent_delta2ns(1, &itimer_clockevent); - err = clocksource_register(&itimer_clocksource); + err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC); if (err) { - printk(KERN_ERR "clocksource_register returned %d\n", err); + printk(KERN_ERR "clocksource_register_hz returned %d\n", err); return; } clockevents_register_device(&itimer_clockevent); diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c index 471b6bca8da..673d7a89d8f 100644 --- a/arch/unicore32/kernel/setup.c +++ b/arch/unicore32/kernel/setup.c @@ -37,6 +37,7 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/traps.h> +#include <asm/memblock.h> #include "setup.h" diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c index 3b379cddbc6..de186bde897 100644 --- a/arch/unicore32/mm/init.c +++ b/arch/unicore32/mm/init.c @@ -26,6 +26,7 @@ #include <asm/setup.h> #include <asm/sizes.h> #include <asm/tlb.h> +#include <asm/memblock.h> #include <mach/map.h> #include "mm.h" @@ -245,7 +246,6 @@ void __init uc32_memblock_init(struct meminfo *mi) sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]), meminfo_cmp, NULL); - memblock_init(); for (i = 0; i < mi->nr_banks; i++) memblock_add(mi->bank[i].start, mi->bank[i].size); @@ -264,7 +264,7 @@ void __init uc32_memblock_init(struct meminfo *mi) uc32_mm_memblock_reserve(); - memblock_analyze(); + memblock_allow_resize(); memblock_dump_all(); } diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c index 3e5c3e5a0b4..43c20b40e44 100644 --- a/arch/unicore32/mm/mmu.c +++ b/arch/unicore32/mm/mmu.c @@ -25,6 +25,7 @@ #include <asm/setup.h> #include <asm/sizes.h> #include <asm/tlb.h> +#include <asm/memblock.h> #include <mach/map.h> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index efb42949cc0..5731eb70e0a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,6 +26,8 @@ config X86 select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select ARCH_DISCARD_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS @@ -204,9 +206,6 @@ config ZONE_DMA32 bool default X86_64 -config ARCH_POPULATES_NODE_MAP - def_bool y - config AUDIT_ARCH bool default X86_64 @@ -343,6 +342,7 @@ config X86_EXTENDED_PLATFORM If you enable this option then you'll be able to select support for the following (non-PC) 64 bit x86 platforms: + Numascale NumaChip ScaleMP vSMP SGI Ultraviolet @@ -351,6 +351,18 @@ config X86_EXTENDED_PLATFORM endif # This is an alphabetically sorted list of 64 bit extended platforms # Please maintain the alphabetic order if and when there are additions +config X86_NUMACHIP + bool "Numascale NumaChip" + depends on X86_64 + depends on X86_EXTENDED_PLATFORM + depends on NUMA + depends on SMP + depends on X86_X2APIC + depends on !EDAC_AMD64 + ---help--- + Adds support for Numascale NumaChip large-SMP systems. Needed to + enable more than ~168 cores. + If you don't have one of these, you should say N here. config X86_VSMP bool "ScaleMP vSMP" diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a6253ec1b28..3e274564f6b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -134,7 +134,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rsp,0 pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ - movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d + movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ @@ -150,9 +150,8 @@ ENTRY(ia32_sysenter_target) .section __ex_table,"a" .quad 1b,ia32_badarg .previous - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,TI_status(%r10) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -162,13 +161,12 @@ sysenter_do_call: sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags(%r10) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status(%r10) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) movl RIP-R11(%rsp),%edx /* User %eip */ @@ -205,7 +203,7 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz ia32_ret_from_sys_call TRACE_IRQS_ON sti @@ -215,12 +213,11 @@ sysexit_from_sys_call: movzbl %al,%edi /* zero-extend that into %edi */ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ call audit_syscall_exit - GET_THREAD_INFO(%r10) movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF - testl %edi,TI_flags(%r10) + testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz \exit CLEAR_RREGS -ARGOFFSET jmp int_with_check @@ -238,7 +235,7 @@ sysexit_audit: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz sysenter_auditsys #endif SAVE_REST @@ -309,9 +306,8 @@ ENTRY(ia32_cstar_target) .section __ex_table,"a" .quad 1b,ia32_badarg .previous - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,TI_status(%r10) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -321,13 +317,12 @@ cstar_do_call: cstar_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags(%r10) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status(%r10) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) RESTORE_ARGS 0,-ARG_SKIP,0,0,0 movl RIP-ARGOFFSET(%rsp),%ecx CFI_REGISTER rip,rcx @@ -355,7 +350,7 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz cstar_auditsys #endif xchgl %r9d,%ebp @@ -420,9 +415,8 @@ ENTRY(ia32_syscall) /* note the registers are not zero extended to the sf. this could be a problem. */ SAVE_ARGS 0,1,0 - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,TI_status(%r10) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -459,8 +453,8 @@ quiet_ni_syscall: CFI_ENDPROC .macro PTREGSCALL label, func, arg - .globl \label -\label: + ALIGN +GLOBAL(\label) leaq \func(%rip),%rax leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ia32_ptregs_common @@ -477,7 +471,8 @@ quiet_ni_syscall: PTREGSCALL stub32_vfork, sys_vfork, %rdi PTREGSCALL stub32_iopl, sys_iopl, %rsi -ENTRY(ia32_ptregs_common) + ALIGN +ia32_ptregs_common: popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 091508b533b..952bd0100c5 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -4,10 +4,10 @@ #ifdef CONFIG_SMP .macro LOCK_PREFIX -1: lock +672: lock .section .smp_locks,"a" .balign 4 - .long 1b - . + .long 672b - . .previous .endm #else diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1a6c09af048..3ab9bdd87e7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -176,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void) } extern int x2apic_phys; +extern int x2apic_preenabled; extern void check_x2apic(void); extern void enable_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); @@ -198,6 +199,9 @@ static inline void x2apic_force_phys(void) x2apic_phys = 1; } #else +static inline void disable_x2apic(void) +{ +} static inline void check_x2apic(void) { } @@ -212,6 +216,7 @@ static inline void x2apic_force_phys(void) { } +#define nox2apic 0 #define x2apic_preenabled 0 #define x2apic_supported() 0 #endif @@ -410,6 +415,7 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); #endif #ifdef CONFIG_X86_LOCAL_APIC + static inline u32 apic_read(u32 reg) { return apic->read(reg); diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h new file mode 100644 index 00000000000..a2d31279644 --- /dev/null +++ b/arch/x86/include/asm/apic_flat_64.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_APIC_FLAT_64_H +#define _ASM_X86_APIC_FLAT_64_H + +extern void flat_init_apic_ldr(void); + +#endif + diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 3925d800786..134bba00df0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -144,6 +144,7 @@ #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) #define APIC_BASE_MSR 0x800 +#define XAPIC_ENABLE (1UL << 11) #define X2APIC_ENABLE (1UL << 10) #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 1775d6e5920..b97596e2b68 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word) return word; } +#undef ADDR + #ifdef __KERNEL__ /** * ffs - find first set bit in word @@ -395,10 +397,25 @@ static inline unsigned long __fls(unsigned long word) static inline int ffs(int x) { int r; -#ifdef CONFIG_X86_CMOV + +#ifdef CONFIG_X86_64 + /* + * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before, except that the + * top 32 bits will be cleared. + * + * We cannot do this on 32 bits because at the very least some + * 486 CPUs did not behave this way. + */ + long tmp = -1; + asm("bsfl %1,%0" + : "=r" (r) + : "rm" (x), "0" (tmp)); +#elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" - : "=r" (r) : "rm" (x), "r" (-1)); + : "=&r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" @@ -422,7 +439,22 @@ static inline int ffs(int x) static inline int fls(int x) { int r; -#ifdef CONFIG_X86_CMOV + +#ifdef CONFIG_X86_64 + /* + * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before, except that the + * top 32 bits will be cleared. + * + * We cannot do this on 32 bits because at the very least some + * 486 CPUs did not behave this way. + */ + long tmp = -1; + asm("bsrl %1,%0" + : "=r" (r) + : "rm" (x), "0" (tmp)); +#elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); @@ -434,11 +466,35 @@ static inline int fls(int x) #endif return r + 1; } -#endif /* __KERNEL__ */ - -#undef ADDR -#ifdef __KERNEL__ +/** + * fls64 - find last set bit in a 64-bit word + * @x: the word to search + * + * This is defined in a similar way as the libc and compiler builtin + * ffsll, but returns the position of the most significant set bit. + * + * fls64(value) returns 0 if value is 0 or the position of the last + * set bit if value is nonzero. The last (most significant) bit is + * at position 64. + */ +#ifdef CONFIG_X86_64 +static __always_inline int fls64(__u64 x) +{ + long bitpos = -1; + /* + * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before. + */ + asm("bsrq %1,%0" + : "+r" (bitpos) + : "rm" (x)); + return bitpos + 1; +} +#else +#include <asm-generic/bitops/fls64.h> +#endif #include <asm-generic/bitops/find.h> @@ -450,12 +506,6 @@ static inline int fls(int x) #include <asm-generic/bitops/const_hweight.h> -#endif /* __KERNEL__ */ - -#include <asm-generic/bitops/fls64.h> - -#ifdef __KERNEL__ - #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 5d3acdf5a7a..0c9fa2745f1 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void) __compiletime_error("Bad argument size for cmpxchg"); extern void __xadd_wrong_size(void) __compiletime_error("Bad argument size for xadd"); +extern void __add_wrong_size(void) + __compiletime_error("Bad argument size for add"); /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to @@ -31,60 +33,47 @@ extern void __xadd_wrong_size(void) #define __X86_CASE_Q -1 /* sizeof will never return -1 */ #endif +/* + * An exchange-type operation, which takes a value and a pointer, and + * returns a the old value. + */ +#define __xchg_op(ptr, arg, op, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (arg); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock #op "b %b0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock #op "w %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock #op "l %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock #op "q %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + default: \ + __ ## op ## _wrong_size(); \ + } \ + __ret; \ + }) + /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. * Since this is generally used to protect other memory information, we * use "asm volatile" and "memory" clobbers to prevent gcc from moving * information around. */ -#define __xchg(x, ptr, size) \ -({ \ - __typeof(*(ptr)) __x = (x); \ - switch (size) { \ - case __X86_CASE_B: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile("xchgb %0,%1" \ - : "=q" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case __X86_CASE_W: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile("xchgw %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case __X86_CASE_L: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case __X86_CASE_Q: \ - { \ - volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - default: \ - __xchg_wrong_size(); \ - } \ - __x; \ -}) - -#define xchg(ptr, v) \ - __xchg((v), (ptr), sizeof(*ptr)) +#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") /* * Atomic compare and exchange. Compare OLD with MEM, if identical, @@ -165,46 +154,80 @@ extern void __xadd_wrong_size(void) __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif -#define __xadd(ptr, inc, lock) \ +/* + * xadd() adds "inc" to "*ptr" and atomically returns the previous + * value of "*ptr". + * + * xadd() is locked when multiple CPUs are online + * xadd_sync() is always locked + * xadd_local() is never locked + */ +#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) +#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) +#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") +#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") + +#define __add(ptr, inc, lock) \ ({ \ __typeof__ (*(ptr)) __ret = (inc); \ switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ - asm volatile (lock "xaddb %b0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addb %b1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ case __X86_CASE_W: \ - asm volatile (lock "xaddw %w0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addw %w1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ case __X86_CASE_L: \ - asm volatile (lock "xaddl %0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addl %1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ case __X86_CASE_Q: \ - asm volatile (lock "xaddq %q0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addq %1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ default: \ - __xadd_wrong_size(); \ + __add_wrong_size(); \ } \ __ret; \ }) /* - * xadd() adds "inc" to "*ptr" and atomically returns the previous - * value of "*ptr". + * add_*() adds "inc" to "*ptr" * - * xadd() is locked when multiple CPUs are online - * xadd_sync() is always locked - * xadd_local() is never locked + * __add() takes a lock prefix + * add_smp() is locked when multiple CPUs are online + * add_sync() is always locked */ -#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") -#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") +#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX) +#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ") + +#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ +({ \ + bool __ret; \ + __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \ + __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \ + BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ + BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ + VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ + VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ + asm volatile(pfx "cmpxchg%c4b %2; sete %0" \ + : "=a" (__ret), "+d" (__old2), \ + "+m" (*(p1)), "+m" (*(p2)) \ + : "i" (2 * sizeof(long)), "a" (__old1), \ + "b" (__new1), "c" (__new2)); \ + __ret; \ +}) + +#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ + __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) + +#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ + __cmpxchg_double(, p1, p2, o1, o2, n1, n2) #endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fbebb07dd80..53f4b219336 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, #endif -#define cmpxchg8b(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __dummy; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \ - : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\ - : "a" (__old1), "d"(__old2), \ - "b" (__new1), "c" (__new2) \ - : "memory"); \ - __ret; }) - - -#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __dummy; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile("cmpxchg8b %2; setz %1" \ - : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\ - : "a" (__old), "d"(__old2), \ - "b" (__new1), "c" (__new2), \ - : "memory"); \ - __ret; }) - - -#define cmpxchg_double(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ - VM_BUG_ON((unsigned long)(ptr) % 8); \ - cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \ -}) - -#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ - VM_BUG_ON((unsigned long)(ptr) % 8); \ - cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ -}) - #define system_has_cmpxchg_double() cpu_has_cx8 #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 285da02c38f..614be87f1a9 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) cmpxchg_local((ptr), (o), (n)); \ }) -#define cmpxchg16b(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __junk; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \ - : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ - : "b"(__new1), "c"(__new2), \ - "a"(__old1), "d"(__old2)); \ - __ret; }) - - -#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __junk; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile("cmpxchg16b %2;setz %1" \ - : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ - : "b"(__new1), "c"(__new2), \ - "a"(__old1), "d"(__old2)); \ - __ret; }) - -#define cmpxchg_double(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - VM_BUG_ON((unsigned long)(ptr) % 16); \ - cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \ -}) - -#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - VM_BUG_ON((unsigned long)(ptr) % 16); \ - cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ -}) - #define system_has_cmpxchg_double() cpu_has_cx16 #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 9a2d644c08e..ced283ac79d 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -4,6 +4,7 @@ #ifdef CONFIG_X86_32 #include <linux/types.h> +#include <linux/log2.h> /* * do_div() is NOT a C function. It wants to return @@ -21,15 +22,20 @@ ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ - asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ - __upper = __high; \ - if (__high) { \ - __upper = __high % (__base); \ - __high = __high / (__base); \ + if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \ + __mod = n & (__base - 1); \ + n >>= ilog2(__base); \ + } else { \ + asm("" : "=a" (__low), "=d" (__high) : "A" (n));\ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + asm("divl %2" : "=a" (__low), "=d" (__mod) \ + : "rm" (__base), "0" (__low), "1" (__upper)); \ + asm("" : "=A" (n) : "a" (__low), "d" (__high)); \ } \ - asm("divl %2":"=a" (__low), "=d" (__mod) \ - : "rm" (__base), "0" (__low), "1" (__upper)); \ - asm("":"=A" (n) : "a" (__low), "d" (__high)); \ __mod; \ }) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 908b96957d8..37782566af2 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,7 +117,7 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); -extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +extern u64 early_reserve_e820(u64 sizet, u64 align); void memblock_x86_fill(void); void memblock_find_dma_reserve(void); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 55e4de613f0..da0b3ca815b 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -11,6 +11,7 @@ typedef struct { #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; + unsigned int icr_read_retry_count; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c9e09ea0564..6919e936345 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu) #ifdef CONFIG_SMP #define safe_address (__per_cpu_offset[0]) #else -#define safe_address (kstat_cpu(0).cpustat.user) +#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) #endif /* diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 88c765e1641..74df3f1eddf 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -137,6 +137,13 @@ static inline int insn_is_avx(struct insn *insn) return (insn->vex_prefix.value != 0); } +/* Ensure this instruction is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h index 853728519ae..88d0c3c74c1 100644 --- a/arch/x86/include/asm/mach_timer.h +++ b/arch/x86/include/asm/mach_timer.h @@ -15,7 +15,7 @@ #define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ #define CALIBRATE_LATCH \ - ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) + ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 01fdf5674e2..0e8e85bb7c5 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void) #else #define lock_cmos_prefix(reg) do {} while (0) #define lock_cmos_suffix(reg) do {} while (0) -#define lock_cmos(reg) -#define unlock_cmos() +#define lock_cmos(reg) do { } while (0) +#define unlock_cmos() do { } while (0) #define do_i_have_lock_cmos() 0 #define current_lock_cmos_reg() 0 #endif diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 0e8ae57d365..6add827381c 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -50,10 +50,11 @@ #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) #define MCJ_CTX_RANDOM 0 /* inject context: random */ -#define MCJ_CTX_PROCESS 1 /* inject context: process */ -#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ -#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ -#define MCJ_EXCEPTION 8 /* raise as exception */ +#define MCJ_CTX_PROCESS 0x1 /* inject context: process */ +#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 0x8 /* raise as exception */ +#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */ /* Fields are zero when not available */ struct mce { @@ -120,7 +121,8 @@ struct mce_log { #ifdef __KERNEL__ -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern void mce_register_decode_chain(struct notifier_block *nb); +extern void mce_unregister_decode_chain(struct notifier_block *nb); #include <linux/percpu.h> #include <linux/init.h> diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h deleted file mode 100644 index 0cd3800f33b..00000000000 --- a/arch/x86/include/asm/memblock.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _X86_MEMBLOCK_H -#define _X86_MEMBLOCK_H - -#define ARCH_DISCARD_MEMBLOCK - -u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); - -void memblock_x86_reserve_range(u64 start, u64 end, char *name); -void memblock_x86_free_range(u64 start, u64 end); -struct range; -int __get_free_all_memory_range(struct range **range, int nodeid, - unsigned long start_pfn, unsigned long end_pfn); -int get_free_all_memory_range(struct range **rangep, int nodeid); - -void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn); -u64 memblock_x86_hole_size(u64 start, u64 end); -u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); -u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); -u64 memblock_x86_memory_in_range(u64 addr, u64 limit); -bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align); - -#endif diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 24215072d0e..4ebe157bf73 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -48,6 +48,7 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); +extern void __exit exit_amd_microcode(void); static inline void get_ucode_data(void *to, const u8 *from, size_t n) { @@ -59,6 +60,7 @@ static inline struct microcode_ops * __init init_amd_microcode(void) { return NULL; } +static inline void __exit exit_amd_microcode(void) {} #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h new file mode 100644 index 00000000000..660f843df92 --- /dev/null +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -0,0 +1,167 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific Header file + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H +#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H + +#include <linux/numa.h> +#include <linux/percpu.h> +#include <linux/io.h> +#include <linux/swab.h> +#include <asm/types.h> +#include <asm/processor.h> + +#define CSR_NODE_SHIFT 16 +#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) +#define CSR_NODE_MASK 0x0fff /* 4K nodes */ + +/* 32K CSR space, b15 indicates geo/non-geo */ +#define CSR_OFFSET_MASK 0x7fffUL + +/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ +#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL +#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL +#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) + +/* + * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however + * when using the direct mapping on x86_64, both start and size needs to be + * aligned with PMD_SIZE which is 2M + */ +#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL +#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL +#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) + +static inline void *gcsr_address(int node, unsigned long offset) +{ + return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | + CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); +} + +static inline void *lcsr_address(unsigned long offset) +{ + return __va(NUMACHIP_LCSR_BASE | (1UL << 15) | + CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); +} + +static inline unsigned int read_gcsr(int node, unsigned long offset) +{ + return swab32(readl(gcsr_address(node, offset))); +} + +static inline void write_gcsr(int node, unsigned long offset, unsigned int val) +{ + writel(swab32(val), gcsr_address(node, offset)); +} + +static inline unsigned int read_lcsr(unsigned long offset) +{ + return swab32(readl(lcsr_address(offset))); +} + +static inline void write_lcsr(unsigned long offset, unsigned int val) +{ + writel(swab32(val), lcsr_address(offset)); +} + +/* ========================================================================= */ +/* CSR_G0_STATE_CLEAR */ +/* ========================================================================= */ + +#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) +union numachip_csr_g0_state_clear { + unsigned int v; + struct numachip_csr_g0_state_clear_s { + unsigned int _state:2; + unsigned int _rsvd_2_6:5; + unsigned int _lost:1; + unsigned int _rsvd_8_31:24; + } s; +}; + +/* ========================================================================= */ +/* CSR_G0_NODE_IDS */ +/* ========================================================================= */ + +#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) +union numachip_csr_g0_node_ids { + unsigned int v; + struct numachip_csr_g0_node_ids_s { + unsigned int _initialid:16; + unsigned int _nodeid:12; + unsigned int _rsvd_28_31:4; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_EXT_IRQ_GEN */ +/* ========================================================================= */ + +#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) +union numachip_csr_g3_ext_irq_gen { + unsigned int v; + struct numachip_csr_g3_ext_irq_gen_s { + unsigned int _vector:8; + unsigned int _msgtype:3; + unsigned int _index:5; + unsigned int _destination_apic_id:16; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_EXT_IRQ_STATUS */ +/* ========================================================================= */ + +#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) +union numachip_csr_g3_ext_irq_status { + unsigned int v; + struct numachip_csr_g3_ext_irq_status_s { + unsigned int _result:32; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_EXT_IRQ_DEST */ +/* ========================================================================= */ + +#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) +union numachip_csr_g3_ext_irq_dest { + unsigned int v; + struct numachip_csr_g3_ext_irq_dest_s { + unsigned int _irq:8; + unsigned int _rsvd_8_31:24; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_NC_ATT_MAP_SELECT */ +/* ========================================================================= */ + +#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) +union numachip_csr_g3_nc_att_map_select { + unsigned int v; + struct numachip_csr_g3_nc_att_map_select_s { + unsigned int _upper_address_bits:4; + unsigned int _select_ram:4; + unsigned int _rsvd_8_31:24; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ +/* ========================================================================= */ + +#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) + +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ + diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 3470c9d0ebb..529bf07e806 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -451,23 +451,20 @@ do { \ #endif /* !CONFIG_M386 */ #ifdef CONFIG_X86_CMPXCHG64 -#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ +#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ ({ \ - char __ret; \ - typeof(o1) __o1 = o1; \ - typeof(o1) __n1 = n1; \ - typeof(o2) __o2 = o2; \ - typeof(o2) __n2 = n2; \ - typeof(o2) __dummy = n2; \ + bool __ret; \ + typeof(pcp1) __o1 = (o1), __n1 = (n1); \ + typeof(pcp2) __o2 = (o2), __n2 = (n2); \ asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ - : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ - : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ + : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \ + : "b" (__n1), "c" (__n2), "a" (__o1)); \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) -#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) -#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define irqsafe_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ /* @@ -508,31 +505,23 @@ do { \ * it in software. The address used in the cmpxchg16 instruction must be * aligned to a 16 byte boundary. */ -#ifdef CONFIG_SMP -#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3 -#else -#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2 -#endif -#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ +#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \ ({ \ - char __ret; \ - typeof(o1) __o1 = o1; \ - typeof(o1) __n1 = n1; \ - typeof(o2) __o2 = o2; \ - typeof(o2) __n2 = n2; \ - typeof(o2) __dummy; \ - alternative_io(CMPXCHG16B_EMU_CALL, \ - "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ + bool __ret; \ + typeof(pcp1) __o1 = (o1), __n1 = (n1); \ + typeof(pcp2) __o2 = (o2), __n2 = (n2); \ + alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \ + "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ - ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ - "S" (&pcp1), "b"(__n1), "c"(__n2), \ - "a"(__o1), "d"(__o2) : "memory"); \ + ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \ + "+m" (pcp2), "+d" (__o2)), \ + "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) -#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) -#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define irqsafe_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #endif diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f61c62f7d5d..096c975e099 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -57,6 +57,7 @@ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +#define ARCH_PERFMON_EVENTS_COUNT 7 /* * Intel "Architectural Performance Monitoring" CPUID @@ -72,6 +73,19 @@ union cpuid10_eax { unsigned int full; }; +union cpuid10_ebx { + struct { + unsigned int no_unhalted_core_cycles:1; + unsigned int no_instructions_retired:1; + unsigned int no_unhalted_reference_cycles:1; + unsigned int no_llc_reference:1; + unsigned int no_llc_misses:1; + unsigned int no_branch_instruction_retired:1; + unsigned int no_branch_misses_retired:1; + } split; + unsigned int full; +}; + union cpuid10_edx { struct { unsigned int num_counters_fixed:5; @@ -81,6 +95,15 @@ union cpuid10_edx { unsigned int full; }; +struct x86_pmu_capability { + int version; + int num_counters_gp; + int num_counters_fixed; + int bit_width_gp; + int bit_width_fixed; + unsigned int events_mask; + int events_mask_len; +}; /* * Fixed-purpose performance events: @@ -89,23 +112,24 @@ union cpuid10_edx { /* * All 3 fixed-mode PMCs are configured via this single MSR: */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d /* * The counts are available in three separate MSRs: */ /* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) +#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -202,6 +226,7 @@ struct perf_guest_switch_msr { }; extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -209,6 +234,11 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) return NULL; } +static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + memset(cap, 0, sizeof(*cap)); +} + static inline void perf_events_lapic_init(void) { } #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 18601c86fab..49afb3f41eb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep); } -#define flush_tlb_fix_spurious_fault(vma, address) +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 2dddb317bb3..f8ab3eaad12 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -6,6 +6,7 @@ * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b650435ffb5..aa9088c2693 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -99,7 +99,6 @@ struct cpuinfo_x86 { u16 apicid; u16 initial_apicid; u16 x86_clflush_size; -#ifdef CONFIG_SMP /* number of cores as seen by the OS: */ u16 booted_cores; /* Physical processor id: */ @@ -110,7 +109,6 @@ struct cpuinfo_x86 { u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; -#endif u32 microcode; } __attribute__((__aligned__(SMP_CACHE_BYTES))); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 972c260919a..a82c2bf504b 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -#if (NR_CPUS < 256) static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { - asm volatile(UNLOCK_LOCK_PREFIX "incb %0" - : "+m" (lock->head_tail) - : - : "memory", "cc"); + __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); } -#else -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) -{ - asm volatile(UNLOCK_LOCK_PREFIX "incw %0" - : "+m" (lock->head_tail) - : - : "memory", "cc"); -} -#endif static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1fe5c127b5..185b719ec61 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,7 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int uaccess_err; + int sig_on_uaccess_error:1; + int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ @@ -231,6 +232,12 @@ static inline struct thread_info *current_thread_info(void) movq PER_CPU_VAR(kernel_stack),reg ; \ subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg +/* + * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in + * a certain register (to be used in assembler memory operands). + */ +#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) + #endif #endif /* !X86_32 */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c00692476e9..800f77c6005 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void); .balance_interval = 1, \ } -#ifdef CONFIG_X86_64 extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) -#endif #else /* !CONFIG_NUMA */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 83e2efd181e..15d99153a96 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +extern int tsc_clocksource_reliable; + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 36361bf6fdd..8be5f54d936 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; }; barrier(); #define uaccess_catch(err) \ - (err) |= current_thread_info()->uaccess_err; \ + (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ current_thread_info()->uaccess_err = prev_err; \ } while (0) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1971e652d24..1ac860a0984 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -7,6 +7,7 @@ struct mpc_bus; struct mpc_cpu; struct mpc_table; +struct cpuinfo_x86; /** * struct x86_init_mpparse - platform specific mpparse ops @@ -147,6 +148,7 @@ struct x86_init_ops { */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; /** @@ -186,5 +188,6 @@ extern struct x86_msi_ops x86_msi; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); +extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node); #endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4558f0d0822..ce664f33ea8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -219,6 +219,8 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; + int apic_id; + u8 enabled; processor = (struct acpi_madt_local_x2apic *)header; @@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); + apic_id = processor->local_apic_id; + enabled = processor->lapic_flags & ACPI_MADT_ENABLED; #ifdef CONFIG_X86_X2APIC /* * We need to register disabled CPU as well to permit @@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - acpi_register_lapic(processor->local_apic_id, /* APIC ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + else + acpi_register_lapic(apic_id, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa8fac..013c1810ce7 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif return (mask >> (4 * cuid)) & 0xf; } @@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask) static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); unsigned int reg; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) return -EINVAL; @@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3d2661ca654..6e76c191a83 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void) */ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); - if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { + if (!addr || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); return 0; } - memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); + memblock_reserve(addr, aper_size); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 767fd04f284..0ae0323b1f9 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2cd2d93643d..2eec05b6d1b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer); int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ -static int x2apic_preenabled; +int x2apic_preenabled; +static int x2apic_disabled; +static int nox2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { - pr_warning("Bios already enabled x2apic, " - "can't enforce nox2apic"); - return 0; - } + int apicid = native_apic_msr_read(APIC_ID); + + if (apicid >= 255) { + pr_warning("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + + pr_warning("x2apic already enabled. will disable it\n"); + } else + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + + nox2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } early_param("nox2apic", setup_nox2apic); @@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void) send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; if (!send_status) break; + inc_irq_stat(icr_read_retry_count); udelay(100); } while (timeout++ < 1000); @@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void) } #ifdef CONFIG_X86_X2APIC +/* + * Need to disable xapic and x2apic at the same time and then enable xapic mode + */ +static inline void __disable_x2apic(u64 msr) +{ + wrmsrl(MSR_IA32_APICBASE, + msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); +} + +static __init void disable_x2apic(void) +{ + u64 msr; + + if (!cpu_has_x2apic) + return; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) { + u32 x2apic_id = read_apic_id(); + + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + pr_info("Disabling x2apic\n"); + __disable_x2apic(msr); + + if (nox2apic) { + clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + } + + x2apic_disabled = 1; + x2apic_mode = 0; + + register_lapic_address(mp_lapic_addr); + } +} + void check_x2apic(void) { if (x2apic_enabled()) { @@ -1441,15 +1491,20 @@ void check_x2apic(void) void enable_x2apic(void) { - int msr, msr2; + u64 msr; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (x2apic_disabled) { + __disable_x2apic(msr); + return; + } if (!x2apic_mode) return; - rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { printk_once(KERN_INFO "Enabling x2apic\n"); - wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); } } #endif /* CONFIG_X86_X2APIC */ @@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void) ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto out; + return; } local_irq_save(flags); legacy_pic->mask_all(); mask_ioapic_entries(); + if (x2apic_preenabled && nox2apic) + disable_x2apic(); + if (dmar_table_init_ret) ret = -1; else ret = enable_IR(); + if (!x2apic_supported()) + goto skip_x2apic; + if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) - goto nox2apic; + !hypervisor_x2apic_available()) { + if (x2apic_preenabled) + disable_x2apic(); + goto skip_x2apic; + } /* * without IR all CPUs can be addressed by IOAPIC/MSI * only in physical mode @@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void) x2apic_force_phys(); } - if (ret == IRQ_REMAP_XAPIC_MODE) - goto nox2apic; + if (ret == IRQ_REMAP_XAPIC_MODE) { + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); + goto skip_x2apic; + } x2apic_enabled = 1; @@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -nox2apic: +skip_x2apic: if (ret < 0) /* IR enabling failed */ restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); - -out: - if (x2apic_enabled || !x2apic_supported()) - return; - - if (x2apic_preenabled) - panic("x2apic: enabled by BIOS but kernel init failed."); - else if (ret == IRQ_REMAP_XAPIC_MODE) - pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); - else if (ret < 0) - pr_info("x2apic not enabled, IRQ remapping init failed\n"); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f7a41e4cae4..8c3cdded6f2 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ -static void flat_init_apic_ldr(void) +void flat_init_apic_ldr(void) { unsigned long val; unsigned long num, id; @@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } +static int flat_probe(void) +{ + return 1; +} + static struct apic apic_flat = { .name = "flat", - .probe = NULL, + .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 00000000000..09d3d8c1cd9 --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,294 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/hardirq.h> +#include <linux/delay.h> + +#include <asm/numachip/numachip_csr.h> +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/apic_flat_64.h> + +static int numachip_system __read_mostly; + +static struct apic apic_numachip __read_mostly; + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned long value; + unsigned int id; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xffU) << 24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + return get_apic_id(apic_read(APIC_ID)); +} + +static int numachip_apic_id_registered(void) +{ + return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static const struct cpumask *numachip_target_cpus(void) +{ + return cpu_online_mask; +} + +static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + + int_gen.s._destination_apic_id = phys_apicid; + int_gen.s._vector = 0; + int_gen.s._msgtype = APIC_DM_INIT >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + int_gen.s._msgtype = APIC_DM_STARTUP >> 8; + int_gen.s._vector = start_rip >> 12; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + atomic_set(&init_deasserted, 1); + return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + + int_gen.s._destination_apic_id = apicid; + int_gen.s._vector = vector; + int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_all(int vector) +{ + numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_first(cpumask); + if (likely((unsigned)cpu < nr_cpu_ids)) + return per_cpu(x86_cpu_to_apicid, cpu); + + return BAD_APICID; +} + +static unsigned int +numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + return per_cpu(x86_cpu_to_apicid, cpu); +} + +static int __init numachip_probe(void) +{ + return apic == &apic_numachip; +} + +static void __init map_csrs(void) +{ + printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", + NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + + printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", + NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + c->phys_proc_id = node; + per_cpu(cpu_llc_id, smp_processor_id()) = node; +} + +static int __init numachip_system_init(void) +{ + unsigned int val; + + if (!numachip_system) + return 0; + + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + + map_csrs(); + + val = read_lcsr(CSR_G0_NODE_IDS); + printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); + + return 0; +} +early_initcall(numachip_system_init); + +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "NUMASC", 6)) { + numachip_system = 1; + return 1; + } + + return 0; +} + +static struct apic apic_numachip __refconst = { + + .name = "NumaConnect system", + .probe = numachip_probe, + .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = numachip_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = numachip_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = numachip_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = NULL, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; +apic_driver(apic_numachip); + diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 89805558551..fb072754bc1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2948,6 +2948,10 @@ static inline void __init check_timer(void) } local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (x2apic_preenabled) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 452932d3473..5da1269e8dd 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size); void __init setup_bios_corruption_check(void) { - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + phys_addr_t start, end; + u64 i; if (memory_corruption_check == -1) { memory_corruption_check = @@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { + start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + if (start >= end) + continue; - if (addr == MEMBLOCK_ERROR) - break; - - if (addr >= corruption_check_size) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; + memblock_reserve(start, end - start); + scan_areas[num_scan_areas].addr = start; + scan_areas[num_scan_areas].size = end - start; /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); + memset(__va(start), 0, end - start); - addr += size; + if (++num_scan_areas >= MAX_SCAN_AREAS) + break; } if (num_scan_areas) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0bab2b18bb2..f4773f4aae3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) @@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) if (node == NUMA_NO_NODE) node = per_cpu(cpu_llc_id, cpu); + /* + * If core numbers are inconsistent, it's likely a multi-fabric platform, + * so invoke platform-specific handler + */ + if (c->phys_proc_id != node) + x86_cpuinit.fixup_cpu_id(c, node); + if (!node_online(node)) { /* * Two possibilities here: diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978e075..159103c0b1f 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ - if (c->x86_model >= 6 && c->x86_model <= 9) { + if (c->x86_model >= 6 && c->x86_model <= 13) { rdmsr(MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); wrmsr(MSR_VIA_FCR, lo, hi); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a83..850f2963a42 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); setup_smep(c); @@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } setup_smep(c); @@ -1141,6 +1136,15 @@ static void dbg_restore_debug_regs(void) #endif /* ! CONFIG_KGDB */ /* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + +/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1b22dcc51af..8bacc7826fb 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,5 +1,4 @@ #ifndef ARCH_X86_CPU_H - #define ARCH_X86_CPU_H struct cpu_model_info { @@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 523131213f0..3e6ff6cbf42 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 319882ef848..fc4beb39357 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/fs.h> +#include <linux/preempt.h> #include <linux/smp.h> #include <linux/notifier.h> #include <linux/kdebug.h> @@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + /* Inject mce on current CPU */ static int raise_local(void) { @@ -139,9 +152,10 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & MCJ_NMI_BROADCAST) { + if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; + get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -151,13 +165,25 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) - apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BRAODCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject NMI %lx\n", + "Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d4c3d..cbe82b5918c 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -119,9 +118,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -190,6 +187,57 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = rcu_dereference_check_mce(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("MCE: skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { int ret = 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f5474218cff..1d76872b6a4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -64,11 +64,9 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; -#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -202,10 +200,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP if (shared_bank[bank] && c->cpu_core_id) break; -#endif + offset = setup_APIC_mce(offset, (high & MASK_LVTOFF_HI) >> 20); @@ -531,7 +528,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -558,7 +554,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index ce215616d5b..39c6089891e 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -323,17 +323,6 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED (0) -#define CORE_POWER_LIMIT ((__u64)1 << 62) -#define PACKAGE_THROTTLED ((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) - static void notify_thresholds(__u64 msr_val) { /* check whether the interrupt handler is defined; @@ -363,27 +352,23 @@ static void intel_thermal_interrupt(void) if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + mce_log_therm_throt_event(msr_val); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + PACKAGE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_POWER_LIMIT - | msr_val); + PACKAGE_LEVEL); } } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2bda212a001..5adce1040b1 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + struct event_constraint **constraints; + struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->constraints = c; + + for (idx = 0; idx < num; idx++) { + if (c[idx]->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + + /* Prefer fixed purpose counters */ + if (x86_pmu.num_counters_fixed) { + idx = X86_PMC_IDX_FIXED; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. + */ +static int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, constraints, n, wmin, wmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} + int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int i, j, w, wmax, num = 0; + int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); - for (i = 0; i < n; i++) { + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); } /* @@ -521,60 +698,12 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (i == n) - goto done; - - /* - * begin slow path - */ - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - /* - * weight = number of possible counters - * - * 1 = most constrained, only works on one counter - * wmax = least constrained, works on any counter - * - * assign events to counters starting with most - * constrained events. - */ - wmax = x86_pmu.num_counters; + /* slow path */ + if (i != n) + num = perf_assign_events(constraints, n, wmin, wmax, assign); /* - * when fixed event counters are present, - * wmax is incremented by 1 to account - * for one more choice - */ - if (x86_pmu.num_counters_fixed) - wmax++; - - for (w = 1, num = n; num && w <= wmax; w++) { - /* for each event */ - for (i = 0; num && i < n; i++) { - c = constraints[i]; - hwc = &cpuc->event_list[i]->hw; - - if (c->weight != w) - continue; - - for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { - if (!test_bit(j, used_mask)) - break; - } - - if (j == X86_PMC_IDX_MAX) - break; - - __set_bit(j, used_mask); - - if (assign) - assign[i] = j; - num--; - } - } -done: - /* * scheduling failed or is just a simulation, * free resources if necessary */ @@ -1119,6 +1248,7 @@ static void __init pmu_check_apic(void) static int __init init_hw_perf_events(void) { + struct x86_pmu_quirk *quirk; struct event_constraint *c; int err; @@ -1147,8 +1277,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.quirks) - x86_pmu.quirks(); + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -1171,12 +1301,18 @@ static int __init init_hw_perf_events(void) unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters); + 0, x86_pmu.num_counters, 0); if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK) + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { continue; + } c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; c->weight += x86_pmu.num_counters; @@ -1566,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs) return misc; } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b9698d40ac4..8944062f46e 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -45,6 +45,7 @@ struct event_constraint { u64 code; u64 cmask; int weight; + int overlap; }; struct amd_nb { @@ -151,15 +152,40 @@ struct cpu_hw_events { void *kfree_on_online; }; -#define __EVENT_CONSTRAINT(c, n, m, w) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ + .overlap = (o), \ } #define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) /* * Constraint on the Event code. @@ -235,6 +261,11 @@ union perf_capabilities { u64 capabilities; }; +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + /* * struct x86_pmu - generic x86 pmu */ @@ -259,6 +290,11 @@ struct x86_pmu { int num_counters_fixed; int cntval_bits; u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; int apic; u64 max_period; struct event_constraint * @@ -268,7 +304,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - void (*quirks)(void); + struct x86_pmu_quirk *quirks; int perfctr_second_write; int (*cpu_prepare)(int cpu); @@ -309,6 +345,15 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + #define ERF_NO_HT_SHARING 1 #define ERF_HAS_RSP_1 2 diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index aeefd45697a..0397b23be8e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = { static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 121f1be4da1..3bd37bdf1b8 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* - * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event - * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed - * ratio between these counters. - */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void) { /* * PEBS is unreliable due to: @@ -1545,19 +1541,60 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } -static void intel_sandybridge_quirks(void) +static __init void intel_sandybridge_quirk(void) { printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + } +} + __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; + union cpuid10_ebx ebx; unsigned int unused; - unsigned int ebx; int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -1574,8 +1611,8 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; version = eax.split.version_id; @@ -1589,6 +1626,9 @@ __init int intel_pmu_init(void) x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1608,6 +1648,8 @@ __init int intel_pmu_init(void) intel_ds_init(); + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + /* * Install the hw-cache-events table: */ @@ -1617,7 +1659,7 @@ __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - x86_pmu.quirks = intel_clovertown_quirks; + x86_add_quirk(intel_clovertown_quirk); case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1651,17 +1693,8 @@ __init int intel_pmu_init(void) /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; - if (ebx & 0x40) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + x86_add_quirk(intel_nehalem_quirk); - pr_cont("erratum AAJ80 worked around, "); - } pr_cont("Nehalem events, "); break; @@ -1701,7 +1734,7 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_pmu.quirks = intel_sandybridge_quirks; + x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -1738,5 +1771,6 @@ __init int intel_pmu_init(void) break; } } + return 0; } diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea297e0..7b3fe56b1c2 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = { "100mhzsteps", "hwpstate", "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 14b23140e81..8022c668148 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e48f07..8071e2f3d6e 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -738,35 +738,17 @@ core_initcall(e820_mark_nvs_memory); /* * pre allocated 4k and reserved it in memblock and e820_saved */ -u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +u64 __init early_reserve_e820(u64 size, u64 align) { - u64 size = 0; u64 addr; - u64 start; - for (start = startt; ; start += size) { - start = memblock_x86_find_in_range_size(start, &size, align); - if (start == MEMBLOCK_ERROR) - return 0; - if (size >= sizet) - break; + addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr) { + e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + update_e820_saved(); } -#ifdef CONFIG_X86_32 - if (start >= MAXMEM) - return 0; - if (start + size > MAXMEM) - size = MAXMEM - start; -#endif - - addr = round_down(start + size - sizet, align); - if (addr < start) - return 0; - memblock_x86_reserve_range(addr, addr + sizet, "new next"); - e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); - update_e820_saved(); - return addr; } @@ -1090,7 +1072,7 @@ void __init memblock_x86_fill(void) * We are safe to enable resizing, beause memblock_x86_fill() * is rather later for x86 */ - memblock_can_resize = 1; + memblock_allow_resize(); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; @@ -1105,22 +1087,36 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } - memblock_analyze(); memblock_dump_all(); } void __init memblock_find_dma_reserve(void) { #ifdef CONFIG_X86_64 - u64 free_size_pfn; - u64 mem_size_pfn; + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start, end; + int i; + u64 u; + /* * need to find out used area below MAX_DMA_PFN * need to use memblock to get free size in [0, MAX_DMA_PFN] * at first, and assume boot_mem will not take below MAX_DMA_PFN */ - mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; - free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; - set_dma_reserve(mem_size_pfn - free_size_pfn); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); + end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + nr_pages += end_pfn - start_pfn; + } + + for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); #endif } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f534400..22d0e21b4dd 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -625,6 +625,8 @@ work_notifysig: # deal with pending signals and movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig @@ -638,6 +640,8 @@ work_notifysig_v86: #else movl %esp, %eax #endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e74b0..a20e1cb9dc8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64) /*CFI_REL_OFFSET ss,0*/ pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ @@ -411,7 +411,7 @@ ENTRY(ret_from_fork) RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - je int_ret_from_sys_call + jz retint_restore_args testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call @@ -465,7 +465,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -ENTRY(system_call_after_swapgs) +GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(old_rsp) movq PER_CPU_VAR(kernel_stack),%rsp @@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET - GET_THREAD_INFO(%rcx) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: cmpq $__NR_syscall_max,%rax @@ -496,10 +495,9 @@ ret_from_sys_call: /* edi: flagmask */ sysret_check: LOCKDEP_SYS_EXIT - GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl TI_flags(%rcx),%edx + movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx andl %edi,%edx jnz sysret_careful CFI_REMEMBER_STATE @@ -583,7 +581,7 @@ sysret_audit: /* Do syscall tracing */ tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz auditsys #endif SAVE_REST @@ -612,8 +610,6 @@ tracesys: GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $3,CS-ARGOFFSET(%rsp) - je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) @@ -953,6 +949,7 @@ END(common_interrupt) ENTRY(\sym) INTR_FRAME pushq_cfi $~(\num) +.Lcommon_\sym: interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + ALIGN + INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 .if NUM_INVALIDATE_TLB_VECTORS > \idx -apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ - invalidate_interrupt\idx smp_invalidate_interrupt +ENTRY(invalidate_interrupt\idx) + pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) + jmp .Lcommon_invalidate_interrupt0 + CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx) .endif .endr + CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ + invalidate_interrupt0, smp_invalidate_interrupt #endif apicinterrupt THRESHOLD_APIC_VECTOR \ diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index af0699ba48c..48d9d4ea102 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -52,5 +52,5 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); + memblock_reserve(lowmem, 0x100000 - lowmem); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3bb08509a7a..51ff18616d5 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_init(); - - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -42,7 +41,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5655c2272ad..3a3b779f41d 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - memblock_init(); - - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data) unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1bb0bf4d92c..07b0a56a754 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -32,8 +32,6 @@ #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) -#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) - /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ @@ -55,6 +53,11 @@ struct hpet_dev { char name[10]; }; +inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +{ + return container_of(evtdev, struct hpet_dev, evt); +} + inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 5d31e5bdbf8..7943e0c21bd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); seq_printf(p, " IRQ work interrupts\n"); + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_printf(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ea9d5f2f13e..2889b3d4388 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_transform_static(struct jump_entry *entry, +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { __jump_label_transform(entry, type, text_poke_early); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d494799aafc..fe86493f3ed 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -1,14 +1,18 @@ /* * AMD CPU Microcode Update Driver for Linux - * Copyright (C) 2008 Advanced Micro Devices Inc. + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. * * Author: Peter Oruba <peter.oruba@amd.com> * * Based on work by: * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * - * This driver allows to upgrade microcode on AMD - * family 0x10 and 0x11 processors. + * Maintainers: + * Andreas Herrmann <andreas.herrmann3@amd.com> + * Borislav Petkov <borislav.petkov@amd.com> + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. @@ -71,6 +75,9 @@ struct microcode_amd { static struct equiv_cpu_entry *equiv_cpu_table; +/* page-sized ucode patch buffer */ +void *patch; + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, - int rev) +static unsigned int verify_ucode_size(int cpu, u32 patch_size, + unsigned int size) { - unsigned int current_cpu_id; - u16 equiv_cpu_id = 0; - unsigned int i = 0; + struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + if (patch_size > min_t(u32, size, max_size)) { + pr_err("patch size mismatch\n"); + return 0; + } + + return patch_size; +} + +static u16 find_equiv_id(void) +{ + unsigned int current_cpu_id, i = 0; BUG_ON(equiv_cpu_table == NULL); + current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { - if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; - break; - } + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + i++; } + return 0; +} +/* + * we signal a good patch is found by returning its size > 0 + */ +static int get_matching_microcode(int cpu, const u8 *ucode_ptr, + unsigned int leftover_size, int rev, + unsigned int *current_size) +{ + struct microcode_header_amd *mc_hdr; + unsigned int actual_size; + u16 equiv_cpu_id; + + /* size of the current patch we're staring at */ + *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + + equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) return 0; + /* + * let's look at the patch header itself now + */ + mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); + if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; @@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, if (mc_hdr->patch_id <= rev) return 0; - return 1; + /* + * now that the header looks sane, verify its size + */ + actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + if (!actual_size) + return 0; + + /* clear the patch buffer */ + memset(patch, 0, PAGE_SIZE); + + /* all looks ok, get the binary patch */ + get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); + + return actual_size; } static int apply_microcode_amd(int cpu) @@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu) return 0; } -static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - u32 max_size, actual_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 - - switch (c->x86) { - case 0x14: - max_size = F14H_MPB_MAX_SIZE; - break; - case 0x15: - max_size = F15H_MPB_MAX_SIZE; - break; - default: - max_size = F1XH_MPB_MAX_SIZE; - break; - } - - actual_size = *(u32 *)(buf + 4); - - if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { - pr_err("section size mismatch\n"); - return 0; - } - - return actual_size; -} - -static struct microcode_header_amd * -get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) -{ - struct microcode_header_amd *mc = NULL; - unsigned int actual_size = 0; - - if (*(u32 *)buf != UCODE_UCODE_TYPE) { - pr_err("invalid type field in container file section header\n"); - goto out; - } - - actual_size = verify_ucode_size(cpu, buf, size); - if (!actual_size) - goto out; - - mc = vzalloc(actual_size); - if (!mc) - goto out; - - get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); - *mc_size = actual_size + SECTION_HDR_SIZE; - -out: - return mc; -} - static int install_equiv_cpu_table(const u8 *buf) { unsigned int *ibuf = (unsigned int *)buf; @@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_hdr = NULL; - unsigned int mc_size, leftover; + unsigned int mc_size, leftover, current_size = 0; int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; unsigned int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_OK; + enum ucode_state state = UCODE_ERROR; offset = install_equiv_cpu_table(ucode_ptr); if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); - return UCODE_ERROR; + goto out; } - ucode_ptr += offset; leftover = size - offset; - while (leftover) { - mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); - if (!mc_hdr) - break; + if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + goto free_table; + } - if (get_matching_microcode(cpu, mc_hdr, new_rev)) { - vfree(new_mc); + while (leftover) { + mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, + new_rev, ¤t_size); + if (mc_size) { + mc_hdr = patch; + new_mc = patch; new_rev = mc_hdr->patch_id; - new_mc = mc_hdr; - } else - vfree(mc_hdr); + goto out_ok; + } - ucode_ptr += mc_size; - leftover -= mc_size; + ucode_ptr += current_size; + leftover -= current_size; } if (!new_mc) { @@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) goto free_table; } - if (!leftover) { - vfree(uci->mc); - uci->mc = new_mc; - pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", - cpu, uci->cpu_sig.rev, new_rev); - } else { - vfree(new_mc); - state = UCODE_ERROR; - } +out_ok: + uci->mc = new_mc; + state = UCODE_OK; + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); free_table: free_equiv_cpu_table(); +out: return state; } @@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc); uci->mc = NULL; } @@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { + patch = (void *)get_zeroed_page(GFP_KERNEL); + if (!patch) + return NULL; + return µcode_amd_ops; } + +void __exit exit_amd_microcode(void) +{ + free_page((unsigned long)patch); +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9d46f5e43b5..9302e2d0eb4 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -563,6 +563,8 @@ module_init(microcode_init); static void __exit microcode_exit(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); @@ -580,6 +582,9 @@ static void __exit microcode_exit(void) microcode_ops = NULL; + if (c->x86_vendor == X86_VENDOR_AMD) + exit_amd_microcode(); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } module_exit(microcode_exit); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0741b062a30..ca470e4c92d 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early) static void __init smp_reserve_memory(struct mpf_intel *mpf) { - unsigned long size = get_mpc_size(mpf->physptr); - - memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); + memblock_reserve(mem, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); @@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt); void __init early_reserve_e820_mpc_new(void) { - if (enable_update_mptable && alloc_mptable) { - u64 startt = 0; - mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); - } + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = early_reserve_e820(mpc_new_length, 4); } static int __init update_mp_table(void) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ee5d4fbd53b..15763af7bfe 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 64e926c89a6..9b9fe4a85c8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -299,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 82528799c5d..89a04c7b5bb 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -749,7 +749,8 @@ put: /* * Handle PTRACE_POKEUSR calls for the debug register area. */ -int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +static int ptrace_set_debugreg(struct task_struct *tsk, int n, + unsigned long val) { struct thread_struct *thread = &(tsk->thread); int rc = 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cf0ef986cb6..d05444ac2ae 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -306,7 +306,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); + memblock_reserve(__pa(_brk_start), + __pa(_brk_end) - __pa(_brk_start)); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -331,13 +332,13 @@ static void __init relocate_initrd(void) ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (ramdisk_here == MEMBLOCK_ERROR) + if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); + memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -393,7 +394,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_x86_free_range(ramdisk_image, ramdisk_end); + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -416,7 +417,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_x86_free_range(ramdisk_image, ramdisk_end); + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else static void __init reserve_initrd(void) @@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; u64 pa_data; - char buf[32]; if (boot_params.hdr.version < 0x0209) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); + memblock_reserve(pa_data, sizeof(*data) + data->len); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void) crash_base = memblock_find_in_range(alignment, CRASH_KERNEL_ADDR_MAX, crash_size, alignment); - if (crash_base == MEMBLOCK_ERROR) { + if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } @@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void) return; } } - memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); + memblock_reserve(crash_base, crash_size); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void) addr = find_ibft_region(&size); if (size) - memblock_x86_reserve_range(addr, addr + size, "* ibft"); + memblock_reserve(addr, size); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb4a95..e38e21754ee 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || - !physid_isset(apicid, phys_cpu_present_map)) { + !physid_isset(apicid, phys_cpu_present_map) || + (!x2apic_mode && apicid >= 255)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a91ae7709b4..a73b61055ad 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -14,11 +14,11 @@ void __init setup_trampolines(void) /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (mem == MEMBLOCK_ERROR) + if (!mem) panic("Cannot allocate trampoline\n"); x86_trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + memblock_reserve(mem, size); printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", x86_trampoline_base, (unsigned long long)mem, size); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466..fa1191fb679 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ -#ifdef CONFIG_KPROBES + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db483369f10..2c9cf0fd78f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) } #define CAL_MS 10 -#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 -#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0aa5fed8b9e..9eba29b46cb 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (tsc_clocksource_reliable) { if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) pr_info( "Skipped synchronization checks as TSC is reliable.\n"); @@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + if (unsynchronized_tsc() || tsc_clocksource_reliable) return; /* diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e4d4a22e8b9..b07ba939356 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; static int __init vsyscall_setup(char *str) { @@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_no = 14; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; + int prev_sig_on_uaccess_error; long ret; /* @@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) if (seccomp_mode(&tsk->seccomp)) do_exit(SIGKILL); + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + /* + * 0 is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, 0 means "don't write anything" not "write it at + * address 0". + */ + ret = -EFAULT; switch (vsyscall_nr) { case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) + break; + ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) + break; + ret = sys_time((time_t __user *)regs->di); break; case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) + break; + ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, 0); break; } + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + if (ret == -EFAULT) { - /* - * Bad news -- userspace fed a bad pointer to a vsyscall. - * - * With a real vsyscall, that would have caused SIGSEGV. - * To make writing reliable exploits using the emulated - * vsyscalls harder, generate SIGSEGV here as well. - */ + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - goto sigsegv; + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ } regs->ax = ret; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd54939..91f83e21b98 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, + .fixup_cpu_id = x86_default_fixup_cpu_id, }; static void default_nmi_init(void) { }; diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 46fc4ee09fc..88ad5fbda6e 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; - table = inat_avx_tables[vex_m][vex_p]; + /* At first, this checks the master table */ + table = inat_avx_tables[vex_m][0]; if (!table) return 0; + if (!inat_is_group(table[opcode]) && vex_p) { + /* If this is not a group, get attribute directly */ + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + } return table[opcode]; } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 374562ed670..5a1f9f3e3fb 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); - if (!inat_accept_vex(insn->attr)) + if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr)) insn->attr = 0; /* This instruction is bad */ goto end; /* VEX has only 1 byte for opcode */ } @@ -249,6 +249,8 @@ void insn_get_modrm(struct insn *insn) pfx = insn_last_prefix(insn); insn->attr = inat_get_group_attribute(mod, pfx, insn->attr); + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) + insn->attr = 0; /* This is bad */ } } diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 82004d2bf05..bd59090825d 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr); size_t strlen(const char *s) { int d0; - int res; + size_t res; asm volatile("repne\n\t" - "scasb\n\t" - "notl %0\n\t" - "decl %0" + "scasb" : "=c" (res), "=&D" (d0) : "1" (s), "a" (0), "0" (0xffffffffu) : "memory"); - return res; + return ~res - 1; } EXPORT_SYMBOL(strlen); #endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index a793da5e560..5b83c51c12e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1,5 +1,11 @@ # x86 Opcode Maps # +# This is (mostly) based on following documentations. +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 +# (#325383-040US, October 2011) +# - Intel(R) Advanced Vector Extensions Programming Reference +# (#319433-011,JUNE 2011). +# #<Opcode maps> # Table: table-name # Referrer: escaped-name @@ -15,10 +21,13 @@ # EndTable # # AVX Superscripts -# (VEX): this opcode can accept VEX prefix. -# (oVEX): this opcode requires VEX prefix. -# (o128): this opcode only supports 128bit VEX. -# (o256): this opcode only supports 256bit VEX. +# (v): this opcode requires VEX prefix. +# (v1): this opcode only supports 128bit VEX. +# +# Last Prefix Superscripts +# - (66): the last prefix is 0x66 +# - (F3): the last prefix is 0xF3 +# - (F2): the last prefix is 0xF2 # Table: one byte opcode @@ -199,8 +208,8 @@ a0: MOV AL,Ob a1: MOV rAX,Ov a2: MOV Ob,AL a3: MOV Ov,rAX -a4: MOVS/B Xb,Yb -a5: MOVS/W/D/Q Xv,Yv +a4: MOVS/B Yb,Xb +a5: MOVS/W/D/Q Yv,Xv a6: CMPS/B Xb,Yb a7: CMPS/W/D Xv,Yv a8: TEST AL,Ib @@ -233,8 +242,8 @@ c0: Grp2 Eb,Ib (1A) c1: Grp2 Ev,Ib (1A) c2: RETN Iw (f64) c3: RETN -c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) -c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) c6: Grp11 Eb,Ib (1A) c7: Grp11 Ev,Iz (1A) c8: ENTER Iw,Ib @@ -320,14 +329,19 @@ AVXcode: 1 # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! Pq,Qq,Ib # 0x0f 0x10-0x1f -10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) -11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) -12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) -13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) -14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) -15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) -16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) -17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) +# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands +# but it actually has operands. And also, vmovss and vmovsd only accept 128bit. +# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form. +# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming +# Reference A.1 +10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1) +11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1) +12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2) +13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1) +14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66) +15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66) +16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3) +17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: 1a: @@ -345,14 +359,14 @@ AVXcode: 1 25: 26: 27: -28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) -29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) -2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) -2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) -2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) -2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) -2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) -2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) +28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66) +29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66) +2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1) +2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66) +2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1) +2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1) +2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) +2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) # 0x0f 0x30-0x3f 30: WRMSR 31: RDTSC @@ -388,65 +402,66 @@ AVXcode: 1 4e: CMOVLE/NG Gv,Ev 4f: CMOVNLE/G Gv,Ev # 0x0f 0x50-0x5f -50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) -51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) -52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) -53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) -54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) -55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) -56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) -57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) -58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) -59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) -5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) -5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) -5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) -5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) -5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) -5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) +50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66) +51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1) +52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1) +53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1) +54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66) +55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66) +56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66) +57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66) +58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1) +59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1) +5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1) +5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3) +5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1) +5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1) +5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1) +5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1) # 0x0f 0x60-0x6f -60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) -61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) -62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) -63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) -64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) -65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) -66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) -67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) -68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) -69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) -6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) -6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) -6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) -6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) -6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) -6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) +60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1) +61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1) +62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1) +63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1) +64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1) +65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1) +66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1) +67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1) +68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1) +69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1) +6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1) +6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1) +6c: vpunpcklqdq Vx,Hx,Wx (66),(v1) +6d: vpunpckhqdq Vx,Hx,Wx (66),(v1) +6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1) +6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3) # 0x0f 0x70-0x7f -70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) +70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1) 71: Grp12 (1A) 72: Grp13 (1A) 73: Grp14 (1A) -74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) -75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) -76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) -77: emms/vzeroupper/vzeroall (VEX) -78: VMREAD Ed/q,Gd/q -79: VMWRITE Gd/q,Ed/q +74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1) +75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1) +76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1) +# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX. +77: emms | vzeroupper | vzeroall +78: VMREAD Ey,Gy +79: VMWRITE Gy,Ey 7a: 7b: -7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) -7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) -7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) -7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) +7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2) +7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2) +7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) +7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) # 0x0f 0x80-0x8f 80: JO Jz (f64) 81: JNO Jz (f64) -82: JB/JNAE/JC Jz (f64) -83: JNB/JAE/JNC Jz (f64) -84: JZ/JE Jz (f64) -85: JNZ/JNE Jz (f64) +82: JB/JC/JNAE Jz (f64) +83: JAE/JNB/JNC Jz (f64) +84: JE/JZ Jz (f64) +85: JNE/JNZ Jz (f64) 86: JBE/JNA Jz (f64) -87: JNBE/JA Jz (f64) +87: JA/JNBE Jz (f64) 88: JS Jz (f64) 89: JNS Jz (f64) 8a: JP/JPE Jz (f64) @@ -502,18 +517,18 @@ b8: JMPE | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev -bd: BSR Gv,Ev +bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf c0: XADD Eb,Gb c1: XADD Ev,Gv -c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) -c3: movnti Md/q,Gd/q -c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) -c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) -c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) +c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1) +c3: movnti My,Gy +c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1) +c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1) +c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66) c7: Grp9 (1A) c8: BSWAP RAX/EAX/R8/R8D c9: BSWAP RCX/ECX/R9/R9D @@ -524,55 +539,55 @@ cd: BSWAP RBP/EBP/R13/R13D ce: BSWAP RSI/ESI/R14/R14D cf: BSWAP RDI/EDI/R15/R15D # 0x0f 0xd0-0xdf -d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) -d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) -d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) -d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) -d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) -d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) -d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) -d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) -d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) -d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) -da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) -db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) -dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) -dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) -de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) -df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) +d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2) +d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1) +d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1) +d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1) +d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1) +d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1) +d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1) +d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1) +d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1) +da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1) +db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) +dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1) +dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1) +de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1) +df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) # 0x0f 0xe0-0xef -e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) -e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) -e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) -e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) -e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) -e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) -e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) -e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) -e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) -e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) -ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) -eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) -ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) -ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) -ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) -ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) +e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1) +e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1) +e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1) +e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1) +e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1) +e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1) +e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2) +e7: movntq Mq,Pq | vmovntdq Mx,Vx (66) +e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1) +e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1) +ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1) +eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) +ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1) +ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1) +ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1) +ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) # 0x0f 0xf0-0xff -f0: lddqu Vdq,Mdq (F2),(VEX) -f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) -f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) -f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) -f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) -f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) -f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) -f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) -f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) -f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) -fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) -fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) -fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) -fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) -fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) +f0: vlddqu Vx,Mx (F2) +f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1) +f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1) +f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1) +f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1) +f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1) +f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1) +f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1) +f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1) +f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1) +fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1) +fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) +fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) +fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) +fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) ff: EndTable @@ -580,155 +595,193 @@ Table: 3-byte opcode 1 (0x0f 0x38) Referrer: 3-byte escape 1 AVXcode: 2 # 0x0f 0x38 0x00-0x0f -00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) -01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) -02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) -03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) -04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) -05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) -06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) -07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) -08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) -09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) -0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) -0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) -0c: Vpermilps /r (66),(oVEX) -0d: Vpermilpd /r (66),(oVEX) -0e: vtestps /r (66),(oVEX) -0f: vtestpd /r (66),(oVEX) +00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1) +01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1) +02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1) +03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1) +04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1) +05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1) +06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1) +07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1) +08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1) +09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1) +0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1) +0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1) +0c: vpermilps Vx,Hx,Wx (66),(v) +0d: vpermilpd Vx,Hx,Wx (66),(v) +0e: vtestps Vx,Wx (66),(v) +0f: vtestpd Vx,Wx (66),(v) # 0x0f 0x38 0x10-0x1f 10: pblendvb Vdq,Wdq (66) 11: 12: -13: +13: vcvtph2ps Vx,Wx,Ib (66),(v) 14: blendvps Vdq,Wdq (66) 15: blendvpd Vdq,Wdq (66) -16: -17: ptest Vdq,Wdq (66),(VEX) -18: vbroadcastss /r (66),(oVEX) -19: vbroadcastsd /r (66),(oVEX),(o256) -1a: vbroadcastf128 /r (66),(oVEX),(o256) +16: vpermps Vqq,Hqq,Wqq (66),(v) +17: vptest Vx,Wx (66) +18: vbroadcastss Vx,Wd (66),(v) +19: vbroadcastsd Vqq,Wq (66),(v) +1a: vbroadcastf128 Vqq,Mdq (66),(v) 1b: -1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) -1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) -1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) +1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1) +1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1) +1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1) 1f: # 0x0f 0x38 0x20-0x2f -20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) -21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) -22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) -23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) -24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) -25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) +20: vpmovsxbw Vx,Ux/Mq (66),(v1) +21: vpmovsxbd Vx,Ux/Md (66),(v1) +22: vpmovsxbq Vx,Ux/Mw (66),(v1) +23: vpmovsxwd Vx,Ux/Mq (66),(v1) +24: vpmovsxwq Vx,Ux/Md (66),(v1) +25: vpmovsxdq Vx,Ux/Mq (66),(v1) 26: 27: -28: pmuldq Vdq,Wdq (66),(VEX),(o128) -29: pcmpeqq Vdq,Wdq (66),(VEX),(o128) -2a: movntdqa Vdq,Mdq (66),(VEX),(o128) -2b: packusdw Vdq,Wdq (66),(VEX),(o128) -2c: vmaskmovps(ld) /r (66),(oVEX) -2d: vmaskmovpd(ld) /r (66),(oVEX) -2e: vmaskmovps(st) /r (66),(oVEX) -2f: vmaskmovpd(st) /r (66),(oVEX) +28: vpmuldq Vx,Hx,Wx (66),(v1) +29: vpcmpeqq Vx,Hx,Wx (66),(v1) +2a: vmovntdqa Vx,Mx (66),(v1) +2b: vpackusdw Vx,Hx,Wx (66),(v1) +2c: vmaskmovps Vx,Hx,Mx (66),(v) +2d: vmaskmovpd Vx,Hx,Mx (66),(v) +2e: vmaskmovps Mx,Hx,Vx (66),(v) +2f: vmaskmovpd Mx,Hx,Vx (66),(v) # 0x0f 0x38 0x30-0x3f -30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) -31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) -32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) -33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) -34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) -35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) -36: -37: pcmpgtq Vdq,Wdq (66),(VEX),(o128) -38: pminsb Vdq,Wdq (66),(VEX),(o128) -39: pminsd Vdq,Wdq (66),(VEX),(o128) -3a: pminuw Vdq,Wdq (66),(VEX),(o128) -3b: pminud Vdq,Wdq (66),(VEX),(o128) -3c: pmaxsb Vdq,Wdq (66),(VEX),(o128) -3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) -3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) -3f: pmaxud Vdq,Wdq (66),(VEX),(o128) +30: vpmovzxbw Vx,Ux/Mq (66),(v1) +31: vpmovzxbd Vx,Ux/Md (66),(v1) +32: vpmovzxbq Vx,Ux/Mw (66),(v1) +33: vpmovzxwd Vx,Ux/Mq (66),(v1) +34: vpmovzxwq Vx,Ux/Md (66),(v1) +35: vpmovzxdq Vx,Ux/Mq (66),(v1) +36: vpermd Vqq,Hqq,Wqq (66),(v) +37: vpcmpgtq Vx,Hx,Wx (66),(v1) +38: vpminsb Vx,Hx,Wx (66),(v1) +39: vpminsd Vx,Hx,Wx (66),(v1) +3a: vpminuw Vx,Hx,Wx (66),(v1) +3b: vpminud Vx,Hx,Wx (66),(v1) +3c: vpmaxsb Vx,Hx,Wx (66),(v1) +3d: vpmaxsd Vx,Hx,Wx (66),(v1) +3e: vpmaxuw Vx,Hx,Wx (66),(v1) +3f: vpmaxud Vx,Hx,Wx (66),(v1) # 0x0f 0x38 0x40-0x8f -40: pmulld Vdq,Wdq (66),(VEX),(o128) -41: phminposuw Vdq,Wdq (66),(VEX),(o128) -80: INVEPT Gd/q,Mdq (66) -81: INVPID Gd/q,Mdq (66) +40: vpmulld Vx,Hx,Wx (66),(v1) +41: vphminposuw Vdq,Wdq (66),(v1) +42: +43: +44: +45: vpsrlvd/q Vx,Hx,Wx (66),(v) +46: vpsravd Vx,Hx,Wx (66),(v) +47: vpsllvd/q Vx,Hx,Wx (66),(v) +# Skip 0x48-0x57 +58: vpbroadcastd Vx,Wx (66),(v) +59: vpbroadcastq Vx,Wx (66),(v) +5a: vbroadcasti128 Vqq,Mdq (66),(v) +# Skip 0x5b-0x77 +78: vpbroadcastb Vx,Wx (66),(v) +79: vpbroadcastw Vx,Wx (66),(v) +# Skip 0x7a-0x7f +80: INVEPT Gy,Mdq (66) +81: INVPID Gy,Mdq (66) +82: INVPCID Gy,Mdq (66) +8c: vpmaskmovd/q Vx,Hx,Mx (66),(v) +8e: vpmaskmovd/q Mx,Vx,Hx (66),(v) # 0x0f 0x38 0x90-0xbf (FMA) -96: vfmaddsub132pd/ps /r (66),(VEX) -97: vfmsubadd132pd/ps /r (66),(VEX) -98: vfmadd132pd/ps /r (66),(VEX) -99: vfmadd132sd/ss /r (66),(VEX),(o128) -9a: vfmsub132pd/ps /r (66),(VEX) -9b: vfmsub132sd/ss /r (66),(VEX),(o128) -9c: vfnmadd132pd/ps /r (66),(VEX) -9d: vfnmadd132sd/ss /r (66),(VEX),(o128) -9e: vfnmsub132pd/ps /r (66),(VEX) -9f: vfnmsub132sd/ss /r (66),(VEX),(o128) -a6: vfmaddsub213pd/ps /r (66),(VEX) -a7: vfmsubadd213pd/ps /r (66),(VEX) -a8: vfmadd213pd/ps /r (66),(VEX) -a9: vfmadd213sd/ss /r (66),(VEX),(o128) -aa: vfmsub213pd/ps /r (66),(VEX) -ab: vfmsub213sd/ss /r (66),(VEX),(o128) -ac: vfnmadd213pd/ps /r (66),(VEX) -ad: vfnmadd213sd/ss /r (66),(VEX),(o128) -ae: vfnmsub213pd/ps /r (66),(VEX) -af: vfnmsub213sd/ss /r (66),(VEX),(o128) -b6: vfmaddsub231pd/ps /r (66),(VEX) -b7: vfmsubadd231pd/ps /r (66),(VEX) -b8: vfmadd231pd/ps /r (66),(VEX) -b9: vfmadd231sd/ss /r (66),(VEX),(o128) -ba: vfmsub231pd/ps /r (66),(VEX) -bb: vfmsub231sd/ss /r (66),(VEX),(o128) -bc: vfnmadd231pd/ps /r (66),(VEX) -bd: vfnmadd231sd/ss /r (66),(VEX),(o128) -be: vfnmsub231pd/ps /r (66),(VEX) -bf: vfnmsub231sd/ss /r (66),(VEX),(o128) +90: vgatherdd/q Vx,Hx,Wx (66),(v) +91: vgatherqd/q Vx,Hx,Wx (66),(v) +92: vgatherdps/d Vx,Hx,Wx (66),(v) +93: vgatherqps/d Vx,Hx,Wx (66),(v) +94: +95: +96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v) +97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v) +98: vfmadd132ps/d Vx,Hx,Wx (66),(v) +99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9a: vfmsub132ps/d Vx,Hx,Wx (66),(v) +9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v) +9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v) +9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v) +a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v) +a8: vfmadd213ps/d Vx,Hx,Wx (66),(v) +a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +aa: vfmsub213ps/d Vx,Hx,Wx (66),(v) +ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) +ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) +af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) +b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) +b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) +b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +ba: vfmsub231ps/d Vx,Hx,Wx (66),(v) +bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v) +bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) +bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff -db: aesimc Vdq,Wdq (66),(VEX),(o128) -dc: aesenc Vdq,Wdq (66),(VEX),(o128) -dd: aesenclast Vdq,Wdq (66),(VEX),(o128) -de: aesdec Vdq,Wdq (66),(VEX),(o128) -df: aesdeclast Vdq,Wdq (66),(VEX),(o128) -f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) -f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +db: VAESIMC Vdq,Wdq (66),(v1) +dc: VAESENC Vdq,Hdq,Wdq (66),(v1) +dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) +de: VAESDEC Vdq,Hdq,Wdq (66),(v1) +df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) +f3: ANDN Gy,By,Ey (v) +f4: Grp17 (1A) +f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) +f6: MULX By,Gy,rDX,Ey (F2),(v) +f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) Referrer: 3-byte escape 2 AVXcode: 3 # 0x0f 0x3a 0x00-0xff -04: vpermilps /r,Ib (66),(oVEX) -05: vpermilpd /r,Ib (66),(oVEX) -06: vperm2f128 /r,Ib (66),(oVEX),(o256) -08: roundps Vdq,Wdq,Ib (66),(VEX) -09: roundpd Vdq,Wdq,Ib (66),(VEX) -0a: roundss Vss,Wss,Ib (66),(VEX),(o128) -0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) -0c: blendps Vdq,Wdq,Ib (66),(VEX) -0d: blendpd Vdq,Wdq,Ib (66),(VEX) -0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) -0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) -14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) -15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) -16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) -17: extractps Ed,Vdq,Ib (66),(VEX),(o128) -18: vinsertf128 /r,Ib (66),(oVEX),(o256) -19: vextractf128 /r,Ib (66),(oVEX),(o256) -20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) -21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) -22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) -40: dpps Vdq,Wdq,Ib (66),(VEX) -41: dppd Vdq,Wdq,Ib (66),(VEX),(o128) -42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) -44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) -4a: vblendvps /r,Ib (66),(oVEX) -4b: vblendvpd /r,Ib (66),(oVEX) -4c: vpblendvb /r,Ib (66),(oVEX),(o128) -60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) -61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) -62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) -63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) -df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) +00: vpermq Vqq,Wqq,Ib (66),(v) +01: vpermpd Vqq,Wqq,Ib (66),(v) +02: vpblendd Vx,Hx,Wx,Ib (66),(v) +03: +04: vpermilps Vx,Wx,Ib (66),(v) +05: vpermilpd Vx,Wx,Ib (66),(v) +06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v) +07: +08: vroundps Vx,Wx,Ib (66) +09: vroundpd Vx,Wx,Ib (66) +0a: vroundss Vss,Wss,Ib (66),(v1) +0b: vroundsd Vsd,Wsd,Ib (66),(v1) +0c: vblendps Vx,Hx,Wx,Ib (66) +0d: vblendpd Vx,Hx,Wx,Ib (66) +0e: vpblendw Vx,Hx,Wx,Ib (66),(v1) +0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1) +14: vpextrb Rd/Mb,Vdq,Ib (66),(v1) +15: vpextrw Rd/Mw,Vdq,Ib (66),(v1) +16: vpextrd/q Ey,Vdq,Ib (66),(v1) +17: vextractps Ed,Vdq,Ib (66),(v1) +18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) +19: vextractf128 Wdq,Vqq,Ib (66),(v) +1d: vcvtps2ph Wx,Vx,Ib (66),(v) +20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1) +21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1) +22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1) +38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) +39: vextracti128 Wdq,Vqq,Ib (66),(v) +40: vdpps Vx,Hx,Wx,Ib (66) +41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1) +42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) +44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1) +46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v) +4a: vblendvps Vx,Hx,Wx,Lx (66),(v) +4b: vblendvpd Vx,Hx,Wx,Lx (66),(v) +4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1) +60: vpcmpestrm Vdq,Wdq,Ib (66),(v1) +61: vpcmpestri Vdq,Wdq,Ib (66),(v1) +62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) +63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) +f0: RORX Gy,Ey,Ib (F2),(v) EndTable GrpTable: Grp1 @@ -790,7 +843,7 @@ GrpTable: Grp5 2: CALLN Ev (f64) 3: CALLF Ep 4: JMPN Ev (f64) -5: JMPF Ep +5: JMPF Mp 6: PUSH Ev (d64) 7: EndTable @@ -807,7 +860,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -824,44 +877,45 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq -6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) -7: VMPTRST Mq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) +7: VMPTRST Mq | VMPTRST Mq (F3) EndTable GrpTable: Grp10 EndTable GrpTable: Grp11 +# Note: the operands are given by group opcode 0: MOV EndTable GrpTable: Grp12 -2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) -4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) -6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) +2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1) +4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1) +6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1) EndTable GrpTable: Grp13 -2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) -4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) -6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) +2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1) +4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) +6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1) EndTable GrpTable: Grp14 -2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) -3: psrldq Udq,Ib (66),(11B),(VEX),(o128) -6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) -7: pslldq Udq,Ib (66),(11B),(VEX),(o128) +2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1) +3: vpsrldq Hx,Ux,Ib (66),(11B),(v1) +6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1) +7: vpslldq Hx,Ux,Ib (66),(11B),(v1) EndTable GrpTable: Grp15 -0: fxsave -1: fxstor -2: ldmxcsr (VEX) -3: stmxcsr (VEX) +0: fxsave | RDFSBASE Ry (F3),(11B) +1: fxstor | RDGSBASE Ry (F3),(11B) +2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) +3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: mfence (11B) +6: XSAVEOPT | mfence (11B) 7: clflush | sfence (11B) EndTable @@ -872,6 +926,12 @@ GrpTable: Grp16 3: prefetch T2 EndTable +GrpTable: Grp17 +1: BLSR By,Ey (v) +2: BLSMSK By,Ey (v) +3: BLSI By,Ey (v) +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3d11327c9ab..23d8e5fecf7 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -27,6 +27,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o - obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index d0474ad2a6e..1fb85dbe390 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs) if (fixup) { /* If fixup is less than 16, it means uaccess error */ if (fixup->fixup < 16) { - current_thread_info()->uaccess_err = -EFAULT; + current_thread_info()->uaccess_err = 1; regs->ip += fixup->fixup; return 1; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5db0490deb0..9d74824a708 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, static noinline void no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; unsigned long *stackend; @@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) + if (fixup_exception(regs)) { + if (current_thread_info()->sig_on_uaccess_error && signal) { + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code | PF_USER; + tsk->thread.cr2 = address; + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_info_fault(signal, si_code, address, tsk, 0); + } return; + } /* * 32-bit: @@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_f00f_bug(regs, address)) return; - no_context(regs, error_code, address); + no_context(regs, error_code, address, SIGSEGV, si_code); } static noinline void @@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { - no_context(regs, error_code, address); + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (!(fault & VM_FAULT_RETRY)) up_read(¤t->mm->mmap_sem); if (!(error_code & PF_USER)) - no_context(regs, error_code, address); + no_context(regs, error_code, address, 0, 0); return 1; } if (!(fault & VM_FAULT_ERROR)) @@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { up_read(¤t->mm->mmap_sem); - no_context(regs, error_code, address); + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); return 1; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 87488b93a65..a298914058f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -67,7 +67,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, good_end = max_pfn_mapped << PAGE_SHIFT; base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (base == MEMBLOCK_ERROR) + if (!base) panic("Cannot find space for the kernel page tables"); pgt_buf_start = base >> PAGE_SHIFT; @@ -80,7 +80,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, void __init native_pagetable_reserve(u64 start, u64 end) { - memblock_x86_reserve_range(start, end, "PGTABLE"); + memblock_reserve(start, end - start); } struct map_range { @@ -279,8 +279,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) * so that they can be reused for other purposes. * - * On native it just means calling memblock_x86_reserve_range, on Xen it - * also means marking RW the pagetable pages that we allocated before + * On native it just means calling memblock_reserve, on Xen it also + * means marking RW the pagetable pages that we allocated before * but that haven't been used. * * In fact on xen we mark RO the whole range pgt_buf_start - diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 29f7c6d9817..0c1da394a63 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -427,23 +427,17 @@ static void __init add_one_highpage_init(struct page *page) void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn) { - struct range *range; - int nr_range; - int i; - - nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); - - for (i = 0; i < nr_range; i++) { - struct page *page; - int node_pfn; - - for (node_pfn = range[i].start; node_pfn < range[i].end; - node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page); - } + phys_addr_t start, end; + u64 i; + + for_each_free_mem_range(i, nid, &start, &end, NULL) { + unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), + start_pfn, end_pfn); + unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), + start_pfn, end_pfn); + for ( ; pfn < e_pfn; pfn++) + if (pfn_valid(pfn)) + add_one_highpage_init(pfn_to_page(pfn)); } } #else @@ -650,18 +644,18 @@ void __init initmem_init(void) highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memblock_x86_register_active_regions(0, 0, highend_pfn); - sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memblock_x86_register_active_regions(0, 0, max_low_pfn); - sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif + + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + sparse_memory_present_with_active_regions(0); + #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bbaaa005bf0..a8a56ce3a96 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -608,7 +608,7 @@ kernel_physical_mapping_init(unsigned long start, #ifndef CONFIG_NUMA void __init initmem_init(void) { - memblock_x86_register_active_regions(0, 0, max_pfn); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); } #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c deleted file mode 100644 index 992da5ec5a6..00000000000 --- a/arch/x86/mm/memblock.c +++ /dev/null @@ -1,348 +0,0 @@ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <linux/memblock.h> -#include <linux/bootmem.h> -#include <linux/mm.h> -#include <linux/range.h> - -/* Check for already reserved areas */ -bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align) -{ - struct memblock_region *r; - u64 addr = *addrp, last; - u64 size = *sizep; - bool changed = false; - -again: - last = addr + size; - for_each_memblock(reserved, r) { - if (last > r->base && addr < r->base) { - size = r->base - addr; - changed = true; - goto again; - } - if (last > (r->base + r->size) && addr < (r->base + r->size)) { - addr = round_up(r->base + r->size, align); - size = last - addr; - changed = true; - goto again; - } - if (last <= (r->base + r->size) && addr >= r->base) { - *sizep = 0; - return false; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find next free range after start, and size is returned in *sizep - */ -u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) -{ - struct memblock_region *r; - - for_each_memblock(memory, r) { - u64 ei_start = r->base; - u64 ei_last = ei_start + r->size; - u64 addr; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (memblock_x86_check_reserved_size(&addr, sizep, align)) - ; - - if (*sizep) - return addr; - } - - return MEMBLOCK_ERROR; -} - -static __init struct range *find_range_array(int count) -{ - u64 end, size, mem; - struct range *range; - - size = sizeof(struct range) * count; - end = memblock.current_limit; - - mem = memblock_find_in_range(0, end, size, sizeof(struct range)); - if (mem == MEMBLOCK_ERROR) - panic("can not find more space for range array"); - - /* - * This range is tempoaray, so don't reserve it, it will not be - * overlapped because We will not alloccate new buffer before - * We discard this one - */ - range = __va(mem); - memset(range, 0, size); - - return range; -} - -static void __init memblock_x86_subtract_reserved(struct range *range, int az) -{ - u64 final_start, final_end; - struct memblock_region *r; - - /* Take out region array itself at first*/ - memblock_free_reserved_regions(); - - memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); - - for_each_memblock(reserved, r) { - memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); - final_start = PFN_DOWN(r->base); - final_end = PFN_UP(r->base + r->size); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - - /* Put region array back ? */ - memblock_reserve_reserved_regions(); -} - -struct count_data { - int nr; -}; - -static int __init count_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - struct count_data *data = datax; - - data->nr++; - - return 0; -} - -static int __init count_early_node_map(int nodeid) -{ - struct count_data data; - - data.nr = 0; - work_with_active_regions(nodeid, count_work_fn, &data); - - return data.nr; -} - -int __init __get_free_all_memory_range(struct range **rangep, int nodeid, - unsigned long start_pfn, unsigned long end_pfn) -{ - int count; - struct range *range; - int nr_range; - - count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; - - range = find_range_array(count); - nr_range = 0; - - /* - * Use early_node_map[] and memblock.reserved.region to get range array - * at first - */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); - subtract_range(range, count, 0, start_pfn); - subtract_range(range, count, end_pfn, -1ULL); - - memblock_x86_subtract_reserved(range, count); - nr_range = clean_sort_range(range, count); - - *rangep = range; - return nr_range; -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - unsigned long end_pfn = -1UL; - -#ifdef CONFIG_X86_32 - end_pfn = max_low_pfn; -#endif - return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); -} - -static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) -{ - int i, count; - struct range *range; - int nr_range; - u64 final_start, final_end; - u64 free_size; - struct memblock_region *r; - - count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; - - range = find_range_array(count); - nr_range = 0; - - addr = PFN_UP(addr); - limit = PFN_DOWN(limit); - - for_each_memblock(memory, r) { - final_start = PFN_UP(r->base); - final_end = PFN_DOWN(r->base + r->size); - if (final_start >= final_end) - continue; - if (final_start >= limit || final_end <= addr) - continue; - - nr_range = add_range(range, count, nr_range, final_start, final_end); - } - subtract_range(range, count, 0, addr); - subtract_range(range, count, limit, -1ULL); - - /* Subtract memblock.reserved.region in range ? */ - if (!get_free) - goto sort_and_count_them; - for_each_memblock(reserved, r) { - final_start = PFN_DOWN(r->base); - final_end = PFN_UP(r->base + r->size); - if (final_start >= final_end) - continue; - if (final_start >= limit || final_end <= addr) - continue; - - subtract_range(range, count, final_start, final_end); - } - -sort_and_count_them: - nr_range = clean_sort_range(range, count); - - free_size = 0; - for (i = 0; i < nr_range; i++) - free_size += range[i].end - range[i].start; - - return free_size << PAGE_SHIFT; -} - -u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) -{ - return __memblock_x86_memory_in_range(addr, limit, true); -} - -u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) -{ - return __memblock_x86_memory_in_range(addr, limit, false); -} - -void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) -{ - if (start == end) - return; - - if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) - return; - - memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); - - memblock_reserve(start, end - start); -} - -void __init memblock_x86_free_range(u64 start, u64 end) -{ - if (start == end) - return; - - if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) - return; - - memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); - - memblock_free(start, end - start); -} - -/* - * Need to call this function after memblock_x86_register_active_regions, - * so early_node_map[] is filled already. - */ -u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) -{ - u64 addr; - addr = find_memory_core_early(nid, size, align, start, end); - if (addr != MEMBLOCK_ERROR) - return addr; - - /* Fallback, should already have start end within node range */ - return memblock_find_in_range(start, end, size, align); -} - -/* - * Finds an active region in the address range from start_pfn to last_pfn and - * returns its range in ei_startpfn and ei_endpfn for the memblock entry. - */ -static int __init memblock_x86_find_active_region(const struct memblock_region *ei, - unsigned long start_pfn, - unsigned long last_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn) -{ - u64 align = PAGE_SIZE; - - *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; - *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; - - /* Skip map entries smaller than a page */ - if (*ei_startpfn >= *ei_endpfn) - return 0; - - /* Skip if map is outside the node */ - if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) - return 0; - - /* Check for overlaps */ - if (*ei_startpfn < start_pfn) - *ei_startpfn = start_pfn; - if (*ei_endpfn > last_pfn) - *ei_endpfn = last_pfn; - - return 1; -} - -/* Walk the memblock.memory map and register active regions within a node */ -void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned long ei_startpfn; - unsigned long ei_endpfn; - struct memblock_region *r; - - for_each_memblock(memory, r) - if (memblock_x86_find_active_region(r, start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - add_active_range(nid, ei_startpfn, ei_endpfn); -} - -/* - * Find the hole size (in bytes) in the memory range. - * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -u64 __init memblock_x86_hole_size(u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long last_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn, ei_endpfn, ram = 0; - struct memblock_region *r; - - for_each_memblock(memory, r) - if (memblock_x86_find_active_region(r, start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - ram += ei_endpfn - ei_startpfn; - - return end - start - ((u64)ram << PAGE_SHIFT); -} diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 92faf3a1c53..c80b9fb9573 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -34,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) (unsigned long long) pattern, (unsigned long long) start_bad, (unsigned long long) end_bad); - memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); + memblock_reserve(start_bad, end_bad - start_bad); } static void __init memtest(u64 pattern, u64 start_phys, u64 size) @@ -70,24 +70,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) static void __init do_one_pass(u64 pattern, u64 start, u64 end) { - u64 size = 0; - - while (start < end) { - start = memblock_x86_find_in_range_size(start, &size, 1); - - /* done ? */ - if (start >= end) - break; - if (start + size > end) - size = end - start; - - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long) start, - (unsigned long long) start + size, - (unsigned long long) cpu_to_be64(pattern)); - memtest(pattern, start, size); - - start += size; + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { + this_start = clamp_t(phys_addr_t, this_start, start, end); + this_end = clamp_t(phys_addr_t, this_end, start, end); + if (this_start < this_end) { + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long)this_start, + (unsigned long long)this_end, + (unsigned long long)cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } } } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fbeaaf41661..496f494593b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -192,8 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) /* Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start, u64 end) { - const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); - const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); bool remapped = false; u64 nd_pa; @@ -224,17 +222,12 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nd_pa = __pa(nd); remapped = true; } else { - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); return; } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); nd = __va(nd_pa); } @@ -371,8 +364,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); + memblock_free(__pa(numa_distance), size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } @@ -395,13 +387,13 @@ static int __init numa_alloc_distance(void) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { + if (!phys) { pr_warning("NUMA: Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; @@ -482,8 +474,8 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) numaram = 0; } - e820ram = max_pfn - (memblock_x86_hole_size(0, - PFN_PHYS(max_pfn)) >> PAGE_SHIFT); + e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", @@ -505,13 +497,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - for (i = 0; i < mi->nr_blks; i++) - memblock_x86_register_active_regions(mi->blk[i].nid, - mi->blk[i].start >> PAGE_SHIFT, - mi->blk[i].end >> PAGE_SHIFT); - - /* for out of order entries */ - sort_node_map(); + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, mb->nid); + } /* * If sections array is gonna be used for pfn -> nid mapping, check @@ -545,6 +534,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) setup_node_data(nid, start, end); } + /* Dump memblock with node info and return. */ + memblock_dump_all(); return 0; } @@ -582,7 +573,7 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); + WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); numa_reset_distance(); ret = init_func(); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 3adebe7e536..534255a36b6 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -199,23 +199,23 @@ void __init init_alloc_remap(int nid, u64 start, u64 end) /* allocate node memory and the lowmem remap area */ node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (node_pa == MEMBLOCK_ERROR) { + if (!node_pa) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); return; } - memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + memblock_reserve(node_pa, size); remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT, size, LARGE_PAGE_BYTES); - if (remap_pa == MEMBLOCK_ERROR) { + if (!remap_pa) { pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", size, nid); - memblock_x86_free_range(node_pa, node_pa + size); + memblock_free(node_pa, size); return; } - memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + memblock_reserve(remap_pa, size); remap_va = phys_to_virt(remap_pa); /* perform actual remap */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dd27f401f0a..92e27119ee1 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -19,7 +19,7 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - pages += free_all_memory_core_early(MAX_NUMNODES); + pages += free_low_memory_core_early(MAX_NUMNODES); return pages; } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index d0ed086b624..46db56845f1 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -28,6 +28,16 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) return -ENOENT; } +static u64 mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + /* * Sets up nid to range from @start to @end. The return value is -errno if * something went wrong, 0 otherwise. @@ -89,7 +99,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Calculate target node size. x86_32 freaks on __udivdi3() so do * the division in ulong number of pages and convert back. */ - size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = max_addr - addr - mem_hole_size(addr, max_addr); size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); /* @@ -135,8 +145,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to add memory to this fake node if its * non-reserved memory is less than the per-node size. */ - while (end - start - - memblock_x86_hole_size(start, end) < size) { + while (end - start - mem_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > limit) { end = limit; @@ -150,7 +159,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -158,8 +167,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - - memblock_x86_hole_size(end, limit) < size) + if (limit - end - mem_hole_size(end, limit) < size) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, @@ -180,7 +188,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { u64 end = start + size; - while (end - start - memblock_x86_hole_size(start, end) < size) { + while (end - start - mem_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { end = max_addr; @@ -211,8 +219,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * creates a uniform distribution of node sizes across the entire * machine (but not necessarily over physical nodes). */ - min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / - MAX_NUMNODES; + min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; min_size = max(min_size, FAKE_NODE_MIN_SIZE); if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) min_size = (min_size + FAKE_NODE_MIN_SIZE) & @@ -252,7 +259,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -260,8 +267,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - - memblock_x86_hole_size(end, limit) < size) + if (limit - end - mem_hole_size(end, limit) < size) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, @@ -351,11 +357,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { + if (!phys) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); + memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) @@ -424,7 +430,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) /* free the copied physical distance table */ if (phys_dist) - memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); + memblock_free(__pa(phys_dist), phys_size); return; no_emu: diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f9e526742fa..eda2acbb6e8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -998,7 +998,7 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -int _set_memory_array(unsigned long *addr, int addrinarray, +static int _set_memory_array(unsigned long *addr, int addrinarray, unsigned long new_type) { int i, j; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 81dbfdeb080..fd61b3fb734 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -69,6 +69,12 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain; + apic_id = pa->apic_id; + if (!cpu_has_x2apic && (apic_id >= 0xff)) { + printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", + pxm, apic_id); + return; + } node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -76,7 +82,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; } - apic_id = pa->apic_id; if (apic_id >= MAX_LOCAL_APIC) { printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 446902b2a6b..1599f568f0e 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -4,9 +4,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ oprof.o cpu_buffer.o buffer_sync.o \ event_buffer.o oprofile_files.o \ oprofilefs.o oprofile_stats.o \ - timer_int.o ) + timer_int.o nmi_timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o -oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c index f148cf65267..9e138d00ad3 100644 --- a/arch/x86/oprofile/init.c +++ b/arch/x86/oprofile/init.c @@ -16,37 +16,23 @@ * with the NMI mode driver. */ +#ifdef CONFIG_X86_LOCAL_APIC extern int op_nmi_init(struct oprofile_operations *ops); -extern int op_nmi_timer_init(struct oprofile_operations *ops); extern void op_nmi_exit(void); -extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); +#else +static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } +static void op_nmi_exit(void) { } +#endif -static int nmi_timer; +extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); int __init oprofile_arch_init(struct oprofile_operations *ops) { - int ret; - - ret = -ENODEV; - -#ifdef CONFIG_X86_LOCAL_APIC - ret = op_nmi_init(ops); -#endif - nmi_timer = (ret != 0); -#ifdef CONFIG_X86_IO_APIC - if (nmi_timer) - ret = op_nmi_timer_init(ops); -#endif ops->backtrace = x86_backtrace; - - return ret; + return op_nmi_init(ops); } - void oprofile_arch_exit(void) { -#ifdef CONFIG_X86_LOCAL_APIC - if (!nmi_timer) - op_nmi_exit(); -#endif + op_nmi_exit(); } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 75f9528e037..26b8a8514ee 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -595,24 +595,36 @@ static int __init p4_init(char **cpu_type) return 0; } -static int force_arch_perfmon; -static int force_cpu_type(const char *str, struct kernel_param *kp) +enum __force_cpu_type { + reserved = 0, /* do not force */ + timer, + arch_perfmon, +}; + +static int force_cpu_type; + +static int set_cpu_type(const char *str, struct kernel_param *kp) { - if (!strcmp(str, "arch_perfmon")) { - force_arch_perfmon = 1; + if (!strcmp(str, "timer")) { + force_cpu_type = timer; + printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); + } else if (!strcmp(str, "arch_perfmon")) { + force_cpu_type = arch_perfmon; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } else { + force_cpu_type = 0; } return 0; } -module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); +module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_arch_perfmon && cpu_has_arch_perfmon) + if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) return 0; /* @@ -679,6 +691,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!cpu_has_apic) return -ENODEV; + if (force_cpu_type == timer) + return -ENODEV; + switch (vendor) { case X86_VENDOR_AMD: /* Needs to be at least an Athlon (or hammer in 32bit mode) */ diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c deleted file mode 100644 index 7f8052cd662..00000000000 --- a/arch/x86/oprofile/nmi_timer_int.c +++ /dev/null @@ -1,50 +0,0 @@ -/** - * @file nmi_timer_int.c - * - * @remark Copyright 2003 OProfile authors - * @remark Read the file COPYING - * - * @author Zwane Mwaikambo <zwane@linuxpower.ca> - */ - -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/oprofile.h> -#include <linux/rcupdate.h> -#include <linux/kdebug.h> - -#include <asm/nmi.h> -#include <asm/apic.h> -#include <asm/ptrace.h> - -static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs) -{ - oprofile_add_sample(regs, 0); - return NMI_HANDLED; -} - -static int timer_start(void) -{ - if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify, - 0, "oprofile-timer")) - return 1; - return 0; -} - - -static void timer_stop(void) -{ - unregister_nmi_handler(NMI_LOCAL, "oprofile-timer"); - synchronize_sched(); /* Allow already-started NMIs to complete. */ -} - - -int __init op_nmi_timer_init(struct oprofile_operations *ops) -{ - ops->start = timer_start; - ops->stop = timer_stop; - ops->cpu_type = "timer"; - printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); - return 0; -} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 37718f0f053..4cf9bd0a165 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm, spin_lock_irqsave(&rtc_lock, flags); efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, tm, tc); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); efi_call_phys_epilog(); spin_unlock_irqrestore(&rtc_lock, flags); return status; @@ -352,8 +353,7 @@ void __init efi_memblock_x86_reserve_range(void) boot_params.efi_info.efi_memdesc_size; memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, - "EFI memmap"); + memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); } #if EFI_DEBUG @@ -397,16 +397,14 @@ void __init efi_reserve_boot_services(void) if ((start+size >= virt_to_phys(_text) && start <= virt_to_phys(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || - memblock_x86_check_reserved_size(&start, &size, - 1<<EFI_PAGE_SHIFT)) { + memblock_is_region_reserved(start, size)) { /* Could not reserve, skip it */ md->num_pages = 0; memblock_dbg(PFX "Could not reserve boot range " "[0x%010llx-0x%010llx]\n", start, start+size-1); } else - memblock_x86_reserve_range(start, start+size, - "EFI Boot"); + memblock_reserve(start, size); } } diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index f8208267733..d511aa97533 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -18,14 +18,21 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk quiet_cmd_posttest = TEST $@ cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) -posttest: $(obj)/test_get_len vmlinux +quiet_cmd_sanitytest = TEST $@ + cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000 + +posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity $(call cmd,posttest) + $(call cmd,sanitytest) -hostprogs-y := test_get_len +hostprogs-y += test_get_len insn_sanity # -I needed for generated C source and C source which in the kernel tree. HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ +HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ + # Dependencies are also needed. $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c +$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index eaf11f52fc0..5f6a5b6c3a1 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -47,7 +47,7 @@ BEGIN { sep_expr = "^\\|$" group_expr = "^Grp[0-9A-Za-z]+" - imm_expr = "^[IJAO][a-z]" + imm_expr = "^[IJAOL][a-z]" imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" @@ -59,6 +59,7 @@ BEGIN { imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" + imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -70,8 +71,12 @@ BEGIN { lprefix3_expr = "\\(F2\\)" max_lprefix = 4 - vexok_expr = "\\(VEX\\)" - vexonly_expr = "\\(oVEX\\)" + # All opcodes starting with lower-case 'v' or with (v1) superscript + # accepts VEX prefix + vexok_opcode_expr = "^v.*" + vexok_expr = "\\(v1\\)" + # All opcodes with (v) superscript supports *only* VEX prefix + vexonly_expr = "\\(v\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -85,8 +90,8 @@ BEGIN { prefix_num["SEG=GS"] = "INAT_PFX_GS" prefix_num["SEG=SS"] = "INAT_PFX_SS" prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" - prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" - prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" + prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" + prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" clear_vars() } @@ -310,12 +315,10 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(opcode, fpu_expr)) flags = add_flags(flags, "INAT_MODRM") - # check VEX only code + # check VEX codes if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") - - # check VEX only code - if (match(ext, vexok_expr)) + else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") # check prefixes diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c new file mode 100644 index 00000000000..cc2f8c13128 --- /dev/null +++ b/arch/x86/tools/insn_sanity.c @@ -0,0 +1,275 @@ +/* + * x86 decoder sanity test - based on test_get_insn.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) Hitachi, Ltd., 2011 + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#define unlikely(cond) (cond) +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) + +#include <asm/insn.h> +#include <inat.c> +#include <insn.c> + +/* + * Test of instruction analysis against tampering. + * Feed random binary to instruction decoder and ensure not to + * access out-of-instruction-buffer. + */ + +#define DEFAULT_MAX_ITER 10000 +#define INSN_NOP 0x90 + +static const char *prog; /* Program name */ +static int verbose; /* Verbosity */ +static int x86_64; /* x86-64 bit mode flag */ +static unsigned int seed; /* Random seed */ +static unsigned long iter_start; /* Start of iteration number */ +static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */ +static FILE *input_file; /* Input file name */ + +static void usage(const char *err) +{ + if (err) + fprintf(stderr, "Error: %s\n\n", err); + fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog); + fprintf(stderr, "\t-y 64bit mode\n"); + fprintf(stderr, "\t-n 32bit mode\n"); + fprintf(stderr, "\t-v Verbosity(-vv dumps any decoded result)\n"); + fprintf(stderr, "\t-s Give a random seed (and iteration number)\n"); + fprintf(stderr, "\t-m Give a maximum iteration number\n"); + fprintf(stderr, "\t-i Give an input file with decoded binary\n"); + exit(1); +} + +static void dump_field(FILE *fp, const char *name, const char *indent, + struct insn_field *field) +{ + fprintf(fp, "%s.%s = {\n", indent, name); + fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", + indent, field->value, field->bytes[0], field->bytes[1], + field->bytes[2], field->bytes[3]); + fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, + field->got, field->nbytes); +} + +static void dump_insn(FILE *fp, struct insn *insn) +{ + fprintf(fp, "Instruction = {\n"); + dump_field(fp, "prefixes", "\t", &insn->prefixes); + dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); + dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); + dump_field(fp, "opcode", "\t", &insn->opcode); + dump_field(fp, "modrm", "\t", &insn->modrm); + dump_field(fp, "sib", "\t", &insn->sib); + dump_field(fp, "displacement", "\t", &insn->displacement); + dump_field(fp, "immediate1", "\t", &insn->immediate1); + dump_field(fp, "immediate2", "\t", &insn->immediate2); + fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", + insn->attr, insn->opnd_bytes, insn->addr_bytes); + fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", + insn->length, insn->x86_64, insn->kaddr); +} + +static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, + unsigned char *insn_buf, struct insn *insn) +{ + int i; + + fprintf(fp, "%s:\n", msg); + + dump_insn(fp, insn); + + fprintf(fp, "You can reproduce this with below command(s);\n"); + + /* Input a decoded instruction sequence directly */ + fprintf(fp, " $ echo "); + for (i = 0; i < MAX_INSN_SIZE; i++) + fprintf(fp, " %02x", insn_buf[i]); + fprintf(fp, " | %s -i -\n", prog); + + if (!input_file) { + fprintf(fp, "Or \n"); + /* Give a seed and iteration number */ + fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter); + } +} + +static void init_random_seed(void) +{ + int fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) + goto fail; + + if (read(fd, &seed, sizeof(seed)) != sizeof(seed)) + goto fail; + + close(fd); + return; +fail: + usage("Failed to open /dev/urandom"); +} + +/* Read given instruction sequence from the input file */ +static int read_next_insn(unsigned char *insn_buf) +{ + char buf[256] = "", *tmp; + int i; + + tmp = fgets(buf, ARRAY_SIZE(buf), input_file); + if (tmp == NULL || feof(input_file)) + return 0; + + for (i = 0; i < MAX_INSN_SIZE; i++) { + insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16); + if (*tmp != ' ') + break; + } + + return i; +} + +static int generate_insn(unsigned char *insn_buf) +{ + int i; + + if (input_file) + return read_next_insn(insn_buf); + + /* Fills buffer with random binary up to MAX_INSN_SIZE */ + for (i = 0; i < MAX_INSN_SIZE - 1; i += 2) + *(unsigned short *)(&insn_buf[i]) = random() & 0xffff; + + while (i < MAX_INSN_SIZE) + insn_buf[i++] = random() & 0xff; + + return i; +} + +static void parse_args(int argc, char **argv) +{ + int c; + char *tmp = NULL; + int set_seed = 0; + + prog = argv[0]; + while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) { + switch (c) { + case 'y': + x86_64 = 1; + break; + case 'n': + x86_64 = 0; + break; + case 'v': + verbose++; + break; + case 'i': + if (strcmp("-", optarg) == 0) + input_file = stdin; + else + input_file = fopen(optarg, "r"); + if (!input_file) + usage("Failed to open input file"); + break; + case 's': + seed = (unsigned int)strtoul(optarg, &tmp, 0); + if (*tmp == ',') { + optarg = tmp + 1; + iter_start = strtoul(optarg, &tmp, 0); + } + if (*tmp != '\0' || tmp == optarg) + usage("Failed to parse seed"); + set_seed = 1; + break; + case 'm': + iter_end = strtoul(optarg, &tmp, 0); + if (*tmp != '\0' || tmp == optarg) + usage("Failed to parse max_iter"); + break; + default: + usage(NULL); + } + } + + /* Check errors */ + if (iter_end < iter_start) + usage("Max iteration number must be bigger than iter-num"); + + if (set_seed && input_file) + usage("Don't use input file (-i) with random seed (-s)"); + + /* Initialize random seed */ + if (!input_file) { + if (!set_seed) /* No seed is given */ + init_random_seed(); + srand(seed); + } +} + +int main(int argc, char **argv) +{ + struct insn insn; + int insns = 0; + int errors = 0; + unsigned long i; + unsigned char insn_buf[MAX_INSN_SIZE * 2]; + + parse_args(argc, argv); + + /* Prepare stop bytes with NOPs */ + memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); + + for (i = 0; i < iter_end; i++) { + if (generate_insn(insn_buf) <= 0) + break; + + if (i < iter_start) /* Skip to given iteration number */ + continue; + + /* Decode an instruction */ + insn_init(&insn, insn_buf, x86_64); + insn_get_length(&insn); + + if (insn.next_byte <= insn.kaddr || + insn.kaddr + MAX_INSN_SIZE < insn.next_byte) { + /* Access out-of-range memory */ + dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn); + errors++; + } else if (verbose && !insn_complete(&insn)) + dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn); + else if (verbose >= 2) + dump_insn(stdout, &insn); + insns++; + } + + fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed); + + return errors ? 1 : 0; +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1f928659c33..12eb07bfb26 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1215,8 +1215,6 @@ asmlinkage void __init xen_start_kernel(void) local_irq_disable(); early_boot_irqs_disabled = true; - memblock_init(); - xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); xen_ident_map_ISA(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 87f6673b120..f4bf8aa574f 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1774,10 +1774,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, __xen_write_cr3(true, __pa(pgd)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_x86_reserve_range(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); + memblock_reserve(__pa(xen_start_info->pt_base), + xen_start_info->nr_pt_frames * PAGE_SIZE); return pgd; } @@ -1853,10 +1851,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_x86_reserve_range(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); + memblock_reserve(__pa(xen_start_info->pt_base), + xen_start_info->nr_pt_frames * PAGE_SIZE)); return initial_page_table; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b2c7179fa26..e03c6369217 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -75,7 +75,7 @@ static void __init xen_add_extra_mem(u64 start, u64 size) if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_x86_reserve_range(start, start + size, "XEN EXTRA"); + memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); @@ -311,9 +311,8 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ - memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), - __pa(xen_start_info->pt_base), - "XEN START INFO"); + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index f3e5eb43f71..ac62f9cf1e1 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -41,14 +41,6 @@ static struct clocksource ccount_clocksource = { .rating = 200, .read = ccount_read, .mask = CLOCKSOURCE_MASK(32), - /* - * With a shift of 22 the lower limit of the cpu clock is - * 1MHz, where NSEC_PER_CCOUNT is 1000 or a bit less than - * 2^10: Since we have 32 bits and the multiplicator can - * already take up as much as 10 bits, this leaves us with - * remaining upper 22 bits. - */ - .shift = 22, }; static irqreturn_t timer_interrupt(int irq, void *dev_id); @@ -66,10 +58,7 @@ void __init time_init(void) printk("%d.%02d MHz\n", (int)ccount_per_jiffy/(1000000/HZ), (int)(ccount_per_jiffy/(10000/HZ))%100); #endif - ccount_clocksource.mult = - clocksource_hz2mult(CCOUNT_PER_JIFFY * HZ, - ccount_clocksource.shift); - clocksource_register(&ccount_clocksource); + clocksource_register_hz(&ccount_clocksource, CCOUNT_PER_JIFFY * HZ); /* Initialize the linux timer interrupt. */ diff --git a/block/ioctl.c b/block/ioctl.c index ca939fc1030..d510c2a4eff 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -180,6 +180,26 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); /* + * Is it an unrecognized ioctl? The correct returns are either + * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a + * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl + * code before returning. + * + * Confused drivers sometimes return EINVAL, which is wrong. It + * means "I understood the ioctl command, but the parameters to + * it were wrong". + * + * We should aim to just fix the broken drivers, the EINVAL case + * should go away. + */ +static inline int is_unrecognized_ioctl(int ret) +{ + return ret == -EINVAL || + ret == -ENOTTY || + ret == -ENOIOCTLCMD; +} + +/* * always keep this in sync with compat_blkdev_ioctl() */ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, @@ -196,8 +216,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -EACCES; ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) + if (!is_unrecognized_ioctl(ret)) return ret; fsync_bdev(bdev); @@ -206,8 +225,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKROSET: ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - /* -EINVAL to handle old uncorrected drivers */ - if (ret != -EINVAL && ret != -ENOTTY) + if (!is_unrecognized_ioctl(ret)) return ret; if (!capable(CAP_SYS_ADMIN)) return -EACCES; diff --git a/drivers/char/random.c b/drivers/char/random.c index 6035ab8d5ef..85da8740586 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -624,8 +624,8 @@ static struct timer_rand_state input_timer_state; static void add_timer_randomness(struct timer_rand_state *state, unsigned num) { struct { - cycles_t cycles; long jiffies; + unsigned cycles; unsigned num; } sample; long delta, delta2, delta3; @@ -637,7 +637,11 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num) goto out; sample.jiffies = jiffies; - sample.cycles = get_cycles(); + + /* Use arch random value, fall back to cycles */ + if (!arch_get_random_int(&sample.cycles)) + sample.cycles = get_cycles(); + sample.num = num; mix_pool_bytes(&input_pool, &sample, sizeof(sample)); diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index effe7974aa9..6b5cf02c35c 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -143,7 +143,7 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, #ifndef CONFIG_X86_64 #include <asm/mach_timer.h> #define PMTMR_EXPECTED_RATE \ - ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10)) + ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (PIT_TICK_RATE>>10)) /* * Some boards have the PMTMR running way too fast. We check * the PMTMR rate against PIT channel 2 to catch these cases. diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index 27c49e60b7d..e7cab2da910 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -53,7 +53,7 @@ static cycle_t i8253_read(struct clocksource *cs) count |= inb_p(PIT_CH0) << 8; /* VIA686a test code... reset the latch if count > max + 1 */ - if (count > LATCH) { + if (count > PIT_LATCH) { outb_p(0x34, PIT_MODE); outb_p(PIT_LATCH & 0xff, PIT_CH0); outb_p(PIT_LATCH >> 8, PIT_CH0); @@ -114,8 +114,8 @@ static void init_pit_timer(enum clock_event_mode mode, case CLOCK_EVT_MODE_PERIODIC: /* binary, mode 2, LSB/MSB, ch 0 */ outb_p(0x34, PIT_MODE); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - outb_p(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(PIT_LATCH & 0xff , PIT_CH0); /* LSB */ + outb_p(PIT_LATCH >> 8 , PIT_CH0); /* MSB */ break; case CLOCK_EVT_MODE_SHUTDOWN: diff --git a/drivers/clocksource/tcb_clksrc.c b/drivers/clocksource/tcb_clksrc.c index 79c47e88d5d..55d0f95f82f 100644 --- a/drivers/clocksource/tcb_clksrc.c +++ b/drivers/clocksource/tcb_clksrc.c @@ -59,7 +59,6 @@ static struct clocksource clksrc = { .rating = 200, .read = tc_get_cycles, .mask = CLOCKSOURCE_MASK(32), - .shift = 18, .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; @@ -256,7 +255,6 @@ static int __init tcb_clksrc_init(void) best_divisor_idx = i; } - clksrc.mult = clocksource_hz2mult(divided_rate, clksrc.shift); printk(bootinfo, clksrc.name, CONFIG_ATMEL_TCB_CLKSRC_BLOCK, divided_rate / 1000000, @@ -292,7 +290,7 @@ static int __init tcb_clksrc_init(void) __raw_writel(ATMEL_TC_SYNC, tcaddr + ATMEL_TC_BCR); /* and away we go! */ - clocksource_register(&clksrc); + clocksource_register_hz(&clksrc, divided_rate); /* channel 2: periodic and oneshot timer support */ setup_clkevents(tc, clk32k_divisor_idx); diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index c97b468ee9f..235a340e81f 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -95,27 +95,26 @@ static struct dbs_tuners { .freq_step = 5, }; -static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, - cputime64_t *wall) +static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) { - cputime64_t idle_time; - cputime64_t cur_wall_time; - cputime64_t busy_time; + u64 idle_time; + u64 cur_wall_time; + u64 busy_time; cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); - busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, - kstat_cpu(cpu).cpustat.system); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice); + busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; - idle_time = cputime64_sub(cur_wall_time, busy_time); + idle_time = cur_wall_time - busy_time; if (wall) - *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); + *wall = jiffies_to_usecs(cur_wall_time); - return (cputime64_t)jiffies_to_usecs(idle_time); + return jiffies_to_usecs(idle_time); } static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) @@ -272,7 +271,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) - dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } return count; } @@ -353,20 +352,20 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); - wall_time = (unsigned int) cputime64_sub(cur_wall_time, - j_dbs_info->prev_cpu_wall); + wall_time = (unsigned int) + (cur_wall_time - j_dbs_info->prev_cpu_wall); j_dbs_info->prev_cpu_wall = cur_wall_time; - idle_time = (unsigned int) cputime64_sub(cur_idle_time, - j_dbs_info->prev_cpu_idle); + idle_time = (unsigned int) + (cur_idle_time - j_dbs_info->prev_cpu_idle); j_dbs_info->prev_cpu_idle = cur_idle_time; if (dbs_tuners_ins.ignore_nice) { - cputime64_t cur_nice; + u64 cur_nice; unsigned long cur_nice_jiffies; - cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, - j_dbs_info->prev_cpu_nice); + cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - + j_dbs_info->prev_cpu_nice; /* * Assumption: nice time between sampling periods will * be less than 2^32 jiffies for 32 bit sys @@ -374,7 +373,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_nice_jiffies = (unsigned long) cputime64_to_jiffies64(cur_nice); - j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; idle_time += jiffies_to_usecs(cur_nice_jiffies); } @@ -501,10 +500,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &j_dbs_info->prev_cpu_wall); - if (dbs_tuners_ins.ignore_nice) { + if (dbs_tuners_ins.ignore_nice) j_dbs_info->prev_cpu_nice = - kstat_cpu(j).cpustat.nice; - } + kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } this_dbs_info->down_skip = 0; this_dbs_info->requested_freq = policy->cur; diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index fa8af4ebb1d..3d679eee70a 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -119,27 +119,26 @@ static struct dbs_tuners { .powersave_bias = 0, }; -static inline cputime64_t get_cpu_idle_time_jiffy(unsigned int cpu, - cputime64_t *wall) +static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) { - cputime64_t idle_time; - cputime64_t cur_wall_time; - cputime64_t busy_time; + u64 idle_time; + u64 cur_wall_time; + u64 busy_time; cur_wall_time = jiffies64_to_cputime64(get_jiffies_64()); - busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user, - kstat_cpu(cpu).cpustat.system); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal); - busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.nice); + busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; - idle_time = cputime64_sub(cur_wall_time, busy_time); + idle_time = cur_wall_time - busy_time; if (wall) - *wall = (cputime64_t)jiffies_to_usecs(cur_wall_time); + *wall = jiffies_to_usecs(cur_wall_time); - return (cputime64_t)jiffies_to_usecs(idle_time); + return jiffies_to_usecs(idle_time); } static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall) @@ -345,7 +344,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->prev_cpu_wall); if (dbs_tuners_ins.ignore_nice) - dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } return count; @@ -442,24 +441,24 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); cur_iowait_time = get_cpu_iowait_time(j, &cur_wall_time); - wall_time = (unsigned int) cputime64_sub(cur_wall_time, - j_dbs_info->prev_cpu_wall); + wall_time = (unsigned int) + (cur_wall_time - j_dbs_info->prev_cpu_wall); j_dbs_info->prev_cpu_wall = cur_wall_time; - idle_time = (unsigned int) cputime64_sub(cur_idle_time, - j_dbs_info->prev_cpu_idle); + idle_time = (unsigned int) + (cur_idle_time - j_dbs_info->prev_cpu_idle); j_dbs_info->prev_cpu_idle = cur_idle_time; - iowait_time = (unsigned int) cputime64_sub(cur_iowait_time, - j_dbs_info->prev_cpu_iowait); + iowait_time = (unsigned int) + (cur_iowait_time - j_dbs_info->prev_cpu_iowait); j_dbs_info->prev_cpu_iowait = cur_iowait_time; if (dbs_tuners_ins.ignore_nice) { - cputime64_t cur_nice; + u64 cur_nice; unsigned long cur_nice_jiffies; - cur_nice = cputime64_sub(kstat_cpu(j).cpustat.nice, - j_dbs_info->prev_cpu_nice); + cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE] - + j_dbs_info->prev_cpu_nice; /* * Assumption: nice time between sampling periods will * be less than 2^32 jiffies for 32 bit sys @@ -467,7 +466,7 @@ static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info) cur_nice_jiffies = (unsigned long) cputime64_to_jiffies64(cur_nice); - j_dbs_info->prev_cpu_nice = kstat_cpu(j).cpustat.nice; + j_dbs_info->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; idle_time += jiffies_to_usecs(cur_nice_jiffies); } @@ -646,10 +645,9 @@ static int cpufreq_governor_dbs(struct cpufreq_policy *policy, j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j, &j_dbs_info->prev_cpu_wall); - if (dbs_tuners_ins.ignore_nice) { + if (dbs_tuners_ins.ignore_nice) j_dbs_info->prev_cpu_nice = - kstat_cpu(j).cpustat.nice; - } + kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } this_dbs_info->cpu = cpu; this_dbs_info->rate_mult = 1; diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c index c5072a91e84..2a508edd768 100644 --- a/drivers/cpufreq/cpufreq_stats.c +++ b/drivers/cpufreq/cpufreq_stats.c @@ -61,9 +61,8 @@ static int cpufreq_stats_update(unsigned int cpu) spin_lock(&cpufreq_stats_lock); stat = per_cpu(cpufreq_stats_table, cpu); if (stat->time_in_state) - stat->time_in_state[stat->last_index] = - cputime64_add(stat->time_in_state[stat->last_index], - cputime_sub(cur_time, stat->last_time)); + stat->time_in_state[stat->last_index] += + cur_time - stat->last_time; stat->last_time = cur_time; spin_unlock(&cpufreq_stats_lock); return 0; diff --git a/drivers/edac/i7core_edac.c b/drivers/edac/i7core_edac.c index 70ad8923f1d..8568d9b6187 100644 --- a/drivers/edac/i7core_edac.c +++ b/drivers/edac/i7core_edac.c @@ -2234,7 +2234,7 @@ static void i7core_unregister_mci(struct i7core_dev *i7core_dev) if (pvt->enable_scrub) disable_sdram_scrub_setting(mci); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &i7_mce_dec); + mce_unregister_decode_chain(&i7_mce_dec); /* Disable EDAC polling */ i7core_pci_ctl_release(pvt); @@ -2336,7 +2336,7 @@ static int i7core_register_mci(struct i7core_dev *i7core_dev) /* DCLK for scrub rate setting */ pvt->dclk_freq = get_dclk_freq(); - atomic_notifier_chain_register(&x86_mce_decoder_chain, &i7_mce_dec); + mce_register_decode_chain(&i7_mce_dec); return 0; diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index d0864d9c38a..bd926ea2e00 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -884,7 +884,7 @@ static int __init mce_amd_init(void) pr_info("MCE: In-kernel MCE decoding enabled.\n"); - atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb); + mce_register_decode_chain(&amd_mce_dec_nb); return 0; } @@ -893,7 +893,7 @@ early_initcall(mce_amd_init); #ifdef MODULE static void __exit mce_amd_exit(void) { - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb); + mce_unregister_decode_chain(&amd_mce_dec_nb); kfree(fam_ops); } diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 7a402bfbee7..1dc118d83cc 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1609,11 +1609,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, mce->cpuvendor, mce->cpuid, mce->time, mce->socketid, mce->apicid); -#ifdef CONFIG_SMP /* Only handle if it is the right mc controller */ if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc) return NOTIFY_DONE; -#endif smp_rmb(); if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { @@ -1661,8 +1659,7 @@ static void sbridge_unregister_mci(struct sbridge_dev *sbridge_dev) debugf0("MC: " __FILE__ ": %s(): mci = %p, dev = %p\n", __func__, mci, &sbridge_dev->pdev[0]->dev); - atomic_notifier_chain_unregister(&x86_mce_decoder_chain, - &sbridge_mce_dec); + mce_unregister_decode_chain(&sbridge_mce_dec); /* Remove MC sysfs nodes */ edac_mc_del_mc(mci->dev); @@ -1731,8 +1728,7 @@ static int sbridge_register_mci(struct sbridge_dev *sbridge_dev) goto fail0; } - atomic_notifier_chain_register(&x86_mce_decoder_chain, - &sbridge_mce_dec); + mce_register_decode_chain(&sbridge_mce_dec); return 0; fail0: diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 104b3767516..1fdef885341 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -57,16 +57,15 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) -#ifdef CONFIG_SMP #define TO_PHYS_ID(cpu) cpu_data(cpu).phys_proc_id #define TO_CORE_ID(cpu) cpu_data(cpu).cpu_core_id +#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) + +#ifdef CONFIG_SMP #define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) #else -#define TO_PHYS_ID(cpu) (cpu) -#define TO_CORE_ID(cpu) (cpu) #define for_each_sibling(i, cpu) for (i = 0; false; ) #endif -#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) /* * Per-Core Temperature Data diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index bdc447fd476..31053a951c3 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -41,6 +41,7 @@ #include <linux/tboot.h> #include <linux/dmi.h> #include <linux/pci-ats.h> +#include <linux/memblock.h> #include <asm/cacheflush.h> #include <asm/iommu.h> @@ -2188,18 +2189,6 @@ static inline void iommu_prepare_isa(void) static int md_domain_init(struct dmar_domain *domain, int guest_width); -static int __init si_domain_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - int *ret = datax; - - *ret = iommu_domain_identity_map(si_domain, - (uint64_t)start_pfn << PAGE_SHIFT, - (uint64_t)end_pfn << PAGE_SHIFT); - return *ret; - -} - static int __init si_domain_init(int hw) { struct dmar_drhd_unit *drhd; @@ -2231,9 +2220,15 @@ static int __init si_domain_init(int hw) return 0; for_each_online_node(nid) { - work_with_active_regions(nid, si_domain_work_fn, &ret); - if (ret) - return ret; + unsigned long start_pfn, end_pfn; + int i; + + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + ret = iommu_domain_identity_map(si_domain, + PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + if (ret) + return ret; + } } return 0; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 65af42f2d59..39809035320 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -697,7 +697,7 @@ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) * interrupts are enabled. We always leave interrupts enabled while * running the Guest. */ - regs->eflags = X86_EFLAGS_IF | 0x2; + regs->eflags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; /* * The "Extended Instruction Pointer" register says where the Guest is diff --git a/drivers/macintosh/rack-meter.c b/drivers/macintosh/rack-meter.c index 2637c139777..6dc26b61219 100644 --- a/drivers/macintosh/rack-meter.c +++ b/drivers/macintosh/rack-meter.c @@ -81,13 +81,13 @@ static int rackmeter_ignore_nice; */ static inline cputime64_t get_cpu_idle_time(unsigned int cpu) { - cputime64_t retval; + u64 retval; - retval = cputime64_add(kstat_cpu(cpu).cpustat.idle, - kstat_cpu(cpu).cpustat.iowait); + retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] + + kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; if (rackmeter_ignore_nice) - retval = cputime64_add(retval, kstat_cpu(cpu).cpustat.nice); + retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE]; return retval; } @@ -220,13 +220,11 @@ static void rackmeter_do_timer(struct work_struct *work) int i, offset, load, cumm, pause; cur_jiffies = jiffies64_to_cputime64(get_jiffies_64()); - total_ticks = (unsigned int)cputime64_sub(cur_jiffies, - rcpu->prev_wall); + total_ticks = (unsigned int) (cur_jiffies - rcpu->prev_wall); rcpu->prev_wall = cur_jiffies; total_idle_ticks = get_cpu_idle_time(cpu); - idle_ticks = (unsigned int) cputime64_sub(total_idle_ticks, - rcpu->prev_idle); + idle_ticks = (unsigned int) (total_idle_ticks - rcpu->prev_idle); rcpu->prev_idle = total_idle_ticks; /* We do a very dumb calculation to update the LEDs for now, diff --git a/drivers/oprofile/nmi_timer_int.c b/drivers/oprofile/nmi_timer_int.c new file mode 100644 index 00000000000..76f1c9357f3 --- /dev/null +++ b/drivers/oprofile/nmi_timer_int.c @@ -0,0 +1,173 @@ +/** + * @file nmi_timer_int.c + * + * @remark Copyright 2011 Advanced Micro Devices, Inc. + * + * @author Robert Richter <robert.richter@amd.com> + */ + +#include <linux/init.h> +#include <linux/smp.h> +#include <linux/errno.h> +#include <linux/oprofile.h> +#include <linux/perf_event.h> + +#ifdef CONFIG_OPROFILE_NMI_TIMER + +static DEFINE_PER_CPU(struct perf_event *, nmi_timer_events); +static int ctr_running; + +static struct perf_event_attr nmi_timer_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +static void nmi_timer_callback(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + event->hw.interrupts = 0; /* don't throttle interrupts */ + oprofile_add_sample(regs, 0); +} + +static int nmi_timer_start_cpu(int cpu) +{ + struct perf_event *event = per_cpu(nmi_timer_events, cpu); + + if (!event) { + event = perf_event_create_kernel_counter(&nmi_timer_attr, cpu, NULL, + nmi_timer_callback, NULL); + if (IS_ERR(event)) + return PTR_ERR(event); + per_cpu(nmi_timer_events, cpu) = event; + } + + if (event && ctr_running) + perf_event_enable(event); + + return 0; +} + +static void nmi_timer_stop_cpu(int cpu) +{ + struct perf_event *event = per_cpu(nmi_timer_events, cpu); + + if (event && ctr_running) + perf_event_disable(event); +} + +static int nmi_timer_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + nmi_timer_start_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + nmi_timer_stop_cpu(cpu); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block nmi_timer_cpu_nb = { + .notifier_call = nmi_timer_cpu_notifier +}; + +static int nmi_timer_start(void) +{ + int cpu; + + get_online_cpus(); + ctr_running = 1; + for_each_online_cpu(cpu) + nmi_timer_start_cpu(cpu); + put_online_cpus(); + + return 0; +} + +static void nmi_timer_stop(void) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + nmi_timer_stop_cpu(cpu); + ctr_running = 0; + put_online_cpus(); +} + +static void nmi_timer_shutdown(void) +{ + struct perf_event *event; + int cpu; + + get_online_cpus(); + unregister_cpu_notifier(&nmi_timer_cpu_nb); + for_each_possible_cpu(cpu) { + event = per_cpu(nmi_timer_events, cpu); + if (!event) + continue; + perf_event_disable(event); + per_cpu(nmi_timer_events, cpu) = NULL; + perf_event_release_kernel(event); + } + + put_online_cpus(); +} + +static int nmi_timer_setup(void) +{ + int cpu, err; + u64 period; + + /* clock cycles per tick: */ + period = (u64)cpu_khz * 1000; + do_div(period, HZ); + nmi_timer_attr.sample_period = period; + + get_online_cpus(); + err = register_cpu_notifier(&nmi_timer_cpu_nb); + if (err) + goto out; + /* can't attach events to offline cpus: */ + for_each_online_cpu(cpu) { + err = nmi_timer_start_cpu(cpu); + if (err) + break; + } + if (err) + nmi_timer_shutdown(); +out: + put_online_cpus(); + return err; +} + +int __init op_nmi_timer_init(struct oprofile_operations *ops) +{ + int err = 0; + + err = nmi_timer_setup(); + if (err) + return err; + nmi_timer_shutdown(); /* only check, don't alloc */ + + ops->create_files = NULL; + ops->setup = nmi_timer_setup; + ops->shutdown = nmi_timer_shutdown; + ops->start = nmi_timer_start; + ops->stop = nmi_timer_stop; + ops->cpu_type = "timer"; + + printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); + + return 0; +} + +#endif diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index f8c752e408a..ed2c3ec0702 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -246,37 +246,31 @@ static int __init oprofile_init(void) int err; /* always init architecture to setup backtrace support */ + timer_mode = 0; err = oprofile_arch_init(&oprofile_ops); + if (!err) { + if (!timer && !oprofilefs_register()) + return 0; + oprofile_arch_exit(); + } - timer_mode = err || timer; /* fall back to timer mode on errors */ - if (timer_mode) { - if (!err) - oprofile_arch_exit(); + /* setup timer mode: */ + timer_mode = 1; + /* no nmi timer mode if oprofile.timer is set */ + if (timer || op_nmi_timer_init(&oprofile_ops)) { err = oprofile_timer_init(&oprofile_ops); if (err) return err; } - err = oprofilefs_register(); - if (!err) - return 0; - - /* failed */ - if (timer_mode) - oprofile_timer_exit(); - else - oprofile_arch_exit(); - - return err; + return oprofilefs_register(); } static void __exit oprofile_exit(void) { oprofilefs_unregister(); - if (timer_mode) - oprofile_timer_exit(); - else + if (!timer_mode) oprofile_arch_exit(); } diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 177b73de5e5..d32ef816337 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -35,7 +35,15 @@ struct dentry; void oprofile_create_files(struct super_block *sb, struct dentry *root); int oprofile_timer_init(struct oprofile_operations *ops); -void oprofile_timer_exit(void); +#ifdef CONFIG_OPROFILE_NMI_TIMER +int op_nmi_timer_init(struct oprofile_operations *ops); +#else +static inline int op_nmi_timer_init(struct oprofile_operations *ops) +{ + return -ENODEV; +} +#endif + int oprofile_set_ulong(unsigned long *addr, unsigned long val); int oprofile_set_timeout(unsigned long time); diff --git a/drivers/oprofile/timer_int.c b/drivers/oprofile/timer_int.c index 878fba12658..93404f72dfa 100644 --- a/drivers/oprofile/timer_int.c +++ b/drivers/oprofile/timer_int.c @@ -97,24 +97,24 @@ static struct notifier_block __refdata oprofile_cpu_notifier = { .notifier_call = oprofile_cpu_notify, }; -int oprofile_timer_init(struct oprofile_operations *ops) +static int oprofile_hrtimer_setup(void) { - int rc; - - rc = register_hotcpu_notifier(&oprofile_cpu_notifier); - if (rc) - return rc; - ops->create_files = NULL; - ops->setup = NULL; - ops->shutdown = NULL; - ops->start = oprofile_hrtimer_start; - ops->stop = oprofile_hrtimer_stop; - ops->cpu_type = "timer"; - printk(KERN_INFO "oprofile: using timer interrupt.\n"); - return 0; + return register_hotcpu_notifier(&oprofile_cpu_notifier); } -void oprofile_timer_exit(void) +static void oprofile_hrtimer_shutdown(void) { unregister_hotcpu_notifier(&oprofile_cpu_notifier); } + +int oprofile_timer_init(struct oprofile_operations *ops) +{ + ops->create_files = NULL; + ops->setup = oprofile_hrtimer_setup; + ops->shutdown = oprofile_hrtimer_shutdown; + ops->start = oprofile_hrtimer_start; + ops->stop = oprofile_hrtimer_stop; + ops->cpu_type = "timer"; + printk(KERN_INFO "oprofile: using timer interrupt.\n"); + return 0; +} diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index f02b5235056..37856f7c778 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -98,11 +98,11 @@ config PCI_PASID If unsure, say N. config PCI_IOAPIC - bool + tristate "PCI IO-APIC hotplug support" if X86 depends on PCI depends on ACPI depends on HOTPLUG - default y + default !X86 config PCI_LABEL def_bool y if (DMI || ACPI) diff --git a/drivers/pci/ioapic.c b/drivers/pci/ioapic.c index 5775638ac01..205af8dc83c 100644 --- a/drivers/pci/ioapic.c +++ b/drivers/pci/ioapic.c @@ -17,7 +17,7 @@ */ #include <linux/pci.h> -#include <linux/export.h> +#include <linux/module.h> #include <linux/acpi.h> #include <linux/slab.h> #include <acpi/acpi_bus.h> @@ -27,7 +27,7 @@ struct ioapic { u32 gsi_base; }; -static int ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent) +static int __devinit ioapic_probe(struct pci_dev *dev, const struct pci_device_id *ent) { acpi_handle handle; acpi_status status; @@ -88,7 +88,7 @@ exit_free: return -ENODEV; } -static void ioapic_remove(struct pci_dev *dev) +static void __devexit ioapic_remove(struct pci_dev *dev) { struct ioapic *ioapic = pci_get_drvdata(dev); @@ -99,13 +99,12 @@ static void ioapic_remove(struct pci_dev *dev) } -static struct pci_device_id ioapic_devices[] = { - { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_SYSTEM_PIC_IOAPIC << 8, 0xffff00, }, - { PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, - PCI_CLASS_SYSTEM_PIC_IOXAPIC << 8, 0xffff00, }, +static DEFINE_PCI_DEVICE_TABLE(ioapic_devices) = { + { PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOAPIC, ~0) }, + { PCI_DEVICE_CLASS(PCI_CLASS_SYSTEM_PIC_IOXAPIC, ~0) }, { } }; +MODULE_DEVICE_TABLE(pci, ioapic_devices); static struct pci_driver ioapic_driver = { .name = "ioapic", diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 51352de88ef..a10e428b32b 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1506,35 +1506,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, return -ENOIOCTLCMD; } -static void compat_ioctl_error(struct file *filp, unsigned int fd, - unsigned int cmd, unsigned long arg) -{ - char buf[10]; - char *fn = "?"; - char *path; - - /* find the name of the device. */ - path = (char *)__get_free_page(GFP_KERNEL); - if (path) { - fn = d_path(&filp->f_path, path, PAGE_SIZE); - if (IS_ERR(fn)) - fn = "?"; - } - - sprintf(buf,"'%c'", (cmd>>_IOC_TYPESHIFT) & _IOC_TYPEMASK); - if (!isprint(buf[1])) - sprintf(buf, "%02x", buf[1]); - compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " - "cmd(%08x){t:%s;sz:%u} arg(%08x) on %s\n", - current->comm, current->pid, - (int)fd, (unsigned int)cmd, buf, - (cmd >> _IOC_SIZESHIFT) & _IOC_SIZEMASK, - (unsigned int)arg, fn); - - if (path) - free_page((unsigned long)path); -} - static int compat_ioctl_check_table(unsigned int xcmd) { int i; @@ -1621,13 +1592,8 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd, goto found_handler; error = do_ioctl_trans(fd, cmd, arg, filp); - if (error == -ENOIOCTLCMD) { - static int count; - - if (++count <= 50) - compat_ioctl_error(filp, fd, cmd, arg); - error = -EINVAL; - } + if (error == -ENOIOCTLCMD) + error = -ENOTTY; goto out_fput; diff --git a/fs/ioctl.c b/fs/ioctl.c index 1d9b9fcb2db..066836e8184 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -42,7 +42,7 @@ static long vfs_ioctl(struct file *filp, unsigned int cmd, error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) - error = -EINVAL; + error = -ENOTTY; out: return error; } diff --git a/fs/proc/array.c b/fs/proc/array.c index 3a1dafd228d..8c344f037bd 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -394,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, sigemptyset(&sigign); sigemptyset(&sigcatch); - cutime = cstime = utime = stime = cputime_zero; - cgtime = gtime = cputime_zero; + cutime = cstime = utime = stime = 0; + cgtime = gtime = 0; if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; @@ -423,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, do { min_flt += t->min_flt; maj_flt += t->maj_flt; - gtime = cputime_add(gtime, t->gtime); + gtime += t->gtime; t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; thread_group_times(task, &utime, &stime); - gtime = cputime_add(gtime, sig->gtime); + gtime += sig->gtime; } sid = task_session_nr_ns(task, ns); diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 0855e6f2039..d76ca6ae2b1 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -22,29 +22,27 @@ #define arch_idle_time(cpu) 0 #endif -static cputime64_t get_idle_time(int cpu) +static u64 get_idle_time(int cpu) { - u64 idle_time = get_cpu_idle_time_us(cpu, NULL); - cputime64_t idle; + u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); if (idle_time == -1ULL) { /* !NO_HZ so we can rely on cpustat.idle */ - idle = kstat_cpu(cpu).cpustat.idle; - idle = cputime64_add(idle, arch_idle_time(cpu)); + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + idle += arch_idle_time(cpu); } else idle = usecs_to_cputime64(idle_time); return idle; } -static cputime64_t get_iowait_time(int cpu) +static u64 get_iowait_time(int cpu) { - u64 iowait_time = get_cpu_iowait_time_us(cpu, NULL); - cputime64_t iowait; + u64 iowait, iowait_time = get_cpu_iowait_time_us(cpu, NULL); if (iowait_time == -1ULL) /* !NO_HZ so we can rely on cpustat.iowait */ - iowait = kstat_cpu(cpu).cpustat.iowait; + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; else iowait = usecs_to_cputime64(iowait_time); @@ -55,33 +53,30 @@ static int show_stat(struct seq_file *p, void *v) { int i, j; unsigned long jif; - cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; - cputime64_t guest, guest_nice; + u64 user, nice, system, idle, iowait, irq, softirq, steal; + u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec boottime; user = nice = system = idle = iowait = - irq = softirq = steal = cputime64_zero; - guest = guest_nice = cputime64_zero; + irq = softirq = steal = 0; + guest = guest_nice = 0; getboottime(&boottime); jif = boottime.tv_sec; for_each_possible_cpu(i) { - user = cputime64_add(user, kstat_cpu(i).cpustat.user); - nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); - system = cputime64_add(system, kstat_cpu(i).cpustat.system); - idle = cputime64_add(idle, get_idle_time(i)); - iowait = cputime64_add(iowait, get_iowait_time(i)); - irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq); - softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); - steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); - guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - guest_nice = cputime64_add(guest_nice, - kstat_cpu(i).cpustat.guest_nice); - sum += kstat_cpu_irqs_sum(i); - sum += arch_irq_stat_cpu(i); + user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; + idle += get_idle_time(i); + iowait += get_iowait_time(i); + irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); @@ -106,16 +101,16 @@ static int show_stat(struct seq_file *p, void *v) (unsigned long long)cputime64_to_clock_t(guest_nice)); for_each_online_cpu(i) { /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ - user = kstat_cpu(i).cpustat.user; - nice = kstat_cpu(i).cpustat.nice; - system = kstat_cpu(i).cpustat.system; + user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; + nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; + system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(i); iowait = get_iowait_time(i); - irq = kstat_cpu(i).cpustat.irq; - softirq = kstat_cpu(i).cpustat.softirq; - steal = kstat_cpu(i).cpustat.steal; - guest = kstat_cpu(i).cpustat.guest; - guest_nice = kstat_cpu(i).cpustat.guest_nice; + irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; + softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; + steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; + guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; + guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu " "%llu\n", diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 766b1d45605..9610ac772d7 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -11,15 +11,20 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec uptime; struct timespec idle; + u64 idletime; + u64 nsec; + u32 rem; int i; - cputime_t idletime = cputime_zero; + idletime = 0; for_each_possible_cpu(i) - idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle); + idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; do_posix_clock_monotonic_gettime(&uptime); monotonic_to_bootbased(&uptime); - cputime_to_timespec(idletime, &idle); + nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; + idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, (uptime.tv_nsec / (NSEC_PER_SEC / 100)), diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 12a1764f612..9a62937c56c 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -4,71 +4,66 @@ #include <linux/time.h> #include <linux/jiffies.h> -typedef unsigned long cputime_t; +typedef unsigned long __nocast cputime_t; -#define cputime_zero (0UL) #define cputime_one_jiffy jiffies_to_cputime(1) -#define cputime_max ((~0UL >> 1) - 1) -#define cputime_add(__a, __b) ((__a) + (__b)) -#define cputime_sub(__a, __b) ((__a) - (__b)) -#define cputime_div(__a, __n) ((__a) / (__n)) -#define cputime_halve(__a) ((__a) >> 1) -#define cputime_eq(__a, __b) ((__a) == (__b)) -#define cputime_gt(__a, __b) ((__a) > (__b)) -#define cputime_ge(__a, __b) ((__a) >= (__b)) -#define cputime_lt(__a, __b) ((__a) < (__b)) -#define cputime_le(__a, __b) ((__a) <= (__b)) -#define cputime_to_jiffies(__ct) (__ct) +#define cputime_to_jiffies(__ct) (__force unsigned long)(__ct) #define cputime_to_scaled(__ct) (__ct) -#define jiffies_to_cputime(__hz) (__hz) +#define jiffies_to_cputime(__hz) (__force cputime_t)(__hz) -typedef u64 cputime64_t; +typedef u64 __nocast cputime64_t; -#define cputime64_zero (0ULL) -#define cputime64_add(__a, __b) ((__a) + (__b)) -#define cputime64_sub(__a, __b) ((__a) - (__b)) -#define cputime64_to_jiffies64(__ct) (__ct) -#define jiffies64_to_cputime64(__jif) (__jif) -#define cputime_to_cputime64(__ct) ((u64) __ct) -#define cputime64_gt(__a, __b) ((__a) > (__b)) +#define cputime64_to_jiffies64(__ct) (__force u64)(__ct) +#define jiffies64_to_cputime64(__jif) (__force cputime64_t)(__jif) -#define nsecs_to_cputime64(__ct) nsecs_to_jiffies64(__ct) +#define nsecs_to_cputime64(__ct) \ + jiffies64_to_cputime64(nsecs_to_jiffies64(__ct)) /* * Convert cputime to microseconds and back. */ -#define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) -#define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) -#define usecs_to_cputime64(__msecs) nsecs_to_jiffies64((__msecs) * 1000) +#define cputime_to_usecs(__ct) \ + jiffies_to_usecs(cputime_to_jiffies(__ct)) +#define usecs_to_cputime(__usec) \ + jiffies_to_cputime(usecs_to_jiffies(__usec)) +#define usecs_to_cputime64(__usec) \ + jiffies64_to_cputime64(nsecs_to_jiffies64((__usec) * 1000)) /* * Convert cputime to seconds and back. */ -#define cputime_to_secs(jif) ((jif) / HZ) -#define secs_to_cputime(sec) ((sec) * HZ) +#define cputime_to_secs(jif) (cputime_to_jiffies(jif) / HZ) +#define secs_to_cputime(sec) jiffies_to_cputime((sec) * HZ) /* * Convert cputime to timespec and back. */ -#define timespec_to_cputime(__val) timespec_to_jiffies(__val) -#define cputime_to_timespec(__ct,__val) jiffies_to_timespec(__ct,__val) +#define timespec_to_cputime(__val) \ + jiffies_to_cputime(timespec_to_jiffies(__val)) +#define cputime_to_timespec(__ct,__val) \ + jiffies_to_timespec(cputime_to_jiffies(__ct),__val) /* * Convert cputime to timeval and back. */ -#define timeval_to_cputime(__val) timeval_to_jiffies(__val) -#define cputime_to_timeval(__ct,__val) jiffies_to_timeval(__ct,__val) +#define timeval_to_cputime(__val) \ + jiffies_to_cputime(timeval_to_jiffies(__val)) +#define cputime_to_timeval(__ct,__val) \ + jiffies_to_timeval(cputime_to_jiffies(__ct),__val) /* * Convert cputime to clock and back. */ -#define cputime_to_clock_t(__ct) jiffies_to_clock_t(__ct) -#define clock_t_to_cputime(__x) clock_t_to_jiffies(__x) +#define cputime_to_clock_t(__ct) \ + jiffies_to_clock_t(cputime_to_jiffies(__ct)) +#define clock_t_to_cputime(__x) \ + jiffies_to_cputime(clock_t_to_jiffies(__x)) /* * Convert cputime64 to clock. */ -#define cputime64_to_clock_t(__ct) jiffies_64_to_clock_t(__ct) +#define cputime64_to_clock_t(__ct) \ + jiffies_64_to_clock_t(cputime64_to_jiffies64(__ct)) #endif diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a3ef66a2a08..3c1063acb2a 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -22,8 +22,14 @@ extern unsigned long __sw_hweight64(__u64 w); #include <asm/bitops.h> #define for_each_set_bit(bit, addr, size) \ - for ((bit) = find_first_bit((addr), (size)); \ - (bit) < (size); \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_cont(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ (bit) = find_next_bit((addr), (size), (bit) + 1)) static __inline__ int get_bitmask_order(unsigned int count) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index ab344a52110..66d3e954eb6 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -44,7 +44,7 @@ extern unsigned long init_bootmem_node(pg_data_t *pgdat, unsigned long endpfn); extern unsigned long init_bootmem(unsigned long addr, unsigned long memend); -unsigned long free_all_memory_core_early(int nodeid); +extern unsigned long free_low_memory_core_early(int nodeid); extern unsigned long free_all_bootmem_node(pg_data_t *pgdat); extern unsigned long free_all_bootmem(void); diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h index 65970b811e2..0e5f5785d9f 100644 --- a/include/linux/debugobjects.h +++ b/include/linux/debugobjects.h @@ -46,6 +46,8 @@ struct debug_obj { * fails * @fixup_free: fixup function, which is called when the free check * fails + * @fixup_assert_init: fixup function, which is called when the assert_init + * check fails */ struct debug_obj_descr { const char *name; @@ -54,6 +56,7 @@ struct debug_obj_descr { int (*fixup_activate) (void *addr, enum debug_obj_state state); int (*fixup_destroy) (void *addr, enum debug_obj_state state); int (*fixup_free) (void *addr, enum debug_obj_state state); + int (*fixup_assert_init)(void *addr, enum debug_obj_state state); }; #ifdef CONFIG_DEBUG_OBJECTS @@ -64,6 +67,7 @@ extern void debug_object_activate (void *addr, struct debug_obj_descr *descr); extern void debug_object_deactivate(void *addr, struct debug_obj_descr *descr); extern void debug_object_destroy (void *addr, struct debug_obj_descr *descr); extern void debug_object_free (void *addr, struct debug_obj_descr *descr); +extern void debug_object_assert_init(void *addr, struct debug_obj_descr *descr); /* * Active state: @@ -89,6 +93,8 @@ static inline void debug_object_destroy (void *addr, struct debug_obj_descr *descr) { } static inline void debug_object_free (void *addr, struct debug_obj_descr *descr) { } +static inline void +debug_object_assert_init(void *addr, struct debug_obj_descr *descr) { } static inline void debug_objects_early_init(void) { } static inline void debug_objects_mem_init(void) { } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 388b0d425b5..5ce8b140428 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -3,6 +3,7 @@ #include <linux/types.h> #include <linux/compiler.h> +#include <linux/workqueue.h> #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) @@ -14,6 +15,12 @@ struct jump_label_key { #endif }; +struct jump_label_key_deferred { + struct jump_label_key key; + unsigned long timeout; + struct delayed_work work; +}; + # include <asm/jump_label.h> # define HAVE_JUMP_LABEL #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ @@ -51,8 +58,11 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void jump_label_inc(struct jump_label_key *key); extern void jump_label_dec(struct jump_label_key *key); +extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); extern bool jump_label_enabled(struct jump_label_key *key); extern void jump_label_apply_nops(struct module *mod); +extern void jump_label_rate_limit(struct jump_label_key_deferred *key, + unsigned long rl); #else /* !HAVE_JUMP_LABEL */ @@ -68,6 +78,10 @@ static __always_inline void jump_label_init(void) { } +struct jump_label_key_deferred { + struct jump_label_key key; +}; + static __always_inline bool static_branch(struct jump_label_key *key) { if (unlikely(atomic_read(&key->enabled))) @@ -85,6 +99,11 @@ static inline void jump_label_dec(struct jump_label_key *key) atomic_dec(&key->enabled); } +static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +{ + jump_label_dec(&key->key); +} + static inline int jump_label_text_reserved(void *start, void *end) { return 0; @@ -102,6 +121,14 @@ static inline int jump_label_apply_nops(struct module *mod) { return 0; } + +static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, + unsigned long rl) +{ +} #endif /* HAVE_JUMP_LABEL */ +#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) +#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) + #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 0cce2db580c..2fbd9053c2d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -6,6 +6,7 @@ #include <linux/percpu.h> #include <linux/cpumask.h> #include <linux/interrupt.h> +#include <linux/sched.h> #include <asm/irq.h> #include <asm/cputime.h> @@ -15,21 +16,25 @@ * used by rstatd/perfmeter */ -struct cpu_usage_stat { - cputime64_t user; - cputime64_t nice; - cputime64_t system; - cputime64_t softirq; - cputime64_t irq; - cputime64_t idle; - cputime64_t iowait; - cputime64_t steal; - cputime64_t guest; - cputime64_t guest_nice; +enum cpu_usage_stat { + CPUTIME_USER, + CPUTIME_NICE, + CPUTIME_SYSTEM, + CPUTIME_SOFTIRQ, + CPUTIME_IRQ, + CPUTIME_IDLE, + CPUTIME_IOWAIT, + CPUTIME_STEAL, + CPUTIME_GUEST, + CPUTIME_GUEST_NICE, + NR_STATS, +}; + +struct kernel_cpustat { + u64 cpustat[NR_STATS]; }; struct kernel_stat { - struct cpu_usage_stat cpustat; #ifndef CONFIG_GENERIC_HARDIRQS unsigned int irqs[NR_IRQS]; #endif @@ -38,10 +43,13 @@ struct kernel_stat { }; DECLARE_PER_CPU(struct kernel_stat, kstat); +DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -#define kstat_cpu(cpu) per_cpu(kstat, cpu) /* Must have preemption disabled for this to be meaningful. */ -#define kstat_this_cpu __get_cpu_var(kstat) +#define kstat_this_cpu (&__get_cpu_var(kstat)) +#define kcpustat_this_cpu (&__get_cpu_var(kernel_cpustat)) +#define kstat_cpu(cpu) per_cpu(kstat, cpu) +#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu) extern unsigned long long nr_context_switches(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index b0e99898527..e23121f9d82 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -10,6 +10,8 @@ #define _INCLUDE_GUARD_LATENCYTOP_H_ #include <linux/compiler.h> +struct task_struct; + #ifdef CONFIG_LATENCYTOP #define LT_SAVECOUNT 32 @@ -23,7 +25,6 @@ struct latency_record { }; -struct task_struct; extern int latencytop_enabled; void __account_scheduler_latency(struct task_struct *task, int usecs, int inter); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b6a56e37284..d36619ead3b 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -343,6 +343,8 @@ extern void lockdep_trace_alloc(gfp_t mask); #define lockdep_assert_held(l) WARN_ON(debug_locks && !lockdep_is_held(l)) +#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) + #else /* !LOCKDEP */ static inline void lockdep_off(void) @@ -392,6 +394,8 @@ struct lock_class_key { }; #define lockdep_assert_held(l) do { } while (0) +#define lockdep_recursing(tsk) (0) + #endif /* !LOCKDEP */ #ifdef CONFIG_LOCK_STAT diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e6b843e16e8..a6bb1023514 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -2,8 +2,6 @@ #define _LINUX_MEMBLOCK_H #ifdef __KERNEL__ -#define MEMBLOCK_ERROR 0 - #ifdef CONFIG_HAVE_MEMBLOCK /* * Logical memory blocks. @@ -19,81 +17,161 @@ #include <linux/init.h> #include <linux/mm.h> -#include <asm/memblock.h> - #define INIT_MEMBLOCK_REGIONS 128 struct memblock_region { phys_addr_t base; phys_addr_t size; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + int nid; +#endif }; struct memblock_type { unsigned long cnt; /* number of regions */ unsigned long max; /* size of the allocated array */ + phys_addr_t total_size; /* size of all regions */ struct memblock_region *regions; }; struct memblock { phys_addr_t current_limit; - phys_addr_t memory_size; /* Updated by memblock_analyze() */ struct memblock_type memory; struct memblock_type reserved; }; extern struct memblock memblock; extern int memblock_debug; -extern int memblock_can_resize; #define memblock_dbg(fmt, ...) \ if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) -u64 memblock_find_in_range(u64 start, u64 end, u64 size, u64 align); +phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid); +phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align); int memblock_free_reserved_regions(void); int memblock_reserve_reserved_regions(void); -extern void memblock_init(void); -extern void memblock_analyze(void); -extern long memblock_add(phys_addr_t base, phys_addr_t size); -extern long memblock_remove(phys_addr_t base, phys_addr_t size); -extern long memblock_free(phys_addr_t base, phys_addr_t size); -extern long memblock_reserve(phys_addr_t base, phys_addr_t size); +void memblock_allow_resize(void); +int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); +int memblock_add(phys_addr_t base, phys_addr_t size); +int memblock_remove(phys_addr_t base, phys_addr_t size); +int memblock_free(phys_addr_t base, phys_addr_t size); +int memblock_reserve(phys_addr_t base, phys_addr_t size); + +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, + unsigned long *out_end_pfn, int *out_nid); + +/** + * for_each_mem_pfn_range - early memory pfn range iterator + * @i: an integer used as loop variable + * @nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to ulong for start pfn of the range, can be %NULL + * @p_end: ptr to ulong for end pfn of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Walks over configured memory ranges. Available after early_node_map is + * populated. + */ +#define for_each_mem_pfn_range(i, nid, p_start, p_end, p_nid) \ + for (i = -1, __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid); \ + i >= 0; __next_mem_pfn_range(&i, nid, p_start, p_end, p_nid)) +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +void __next_free_mem_range(u64 *idx, int nid, phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid); + +/** + * for_each_free_mem_range - iterate through free memblock areas + * @i: u64 used as loop variable + * @nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock. Available as + * soon as memblock is initialized. + */ +#define for_each_free_mem_range(i, nid, p_start, p_end, p_nid) \ + for (i = 0, \ + __next_free_mem_range(&i, nid, p_start, p_end, p_nid); \ + i != (u64)ULLONG_MAX; \ + __next_free_mem_range(&i, nid, p_start, p_end, p_nid)) + +void __next_free_mem_range_rev(u64 *idx, int nid, phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid); -/* The numa aware allocator is only available if - * CONFIG_ARCH_POPULATES_NODE_MAP is set +/** + * for_each_free_mem_range_reverse - rev-iterate through free memblock areas + * @i: u64 used as loop variable + * @nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Walks over free (memory && !reserved) areas of memblock in reverse + * order. Available as soon as memblock is initialized. */ -extern phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, - int nid); -extern phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, - int nid); +#define for_each_free_mem_range_reverse(i, nid, p_start, p_end, p_nid) \ + for (i = (u64)ULLONG_MAX, \ + __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid); \ + i != (u64)ULLONG_MAX; \ + __next_free_mem_range_rev(&i, nid, p_start, p_end, p_nid)) -extern phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int memblock_set_node(phys_addr_t base, phys_addr_t size, int nid); + +static inline void memblock_set_region_node(struct memblock_region *r, int nid) +{ + r->nid = nid; +} + +static inline int memblock_get_region_node(const struct memblock_region *r) +{ + return r->nid; +} +#else +static inline void memblock_set_region_node(struct memblock_region *r, int nid) +{ +} + +static inline int memblock_get_region_node(const struct memblock_region *r) +{ + return 0; +} +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +phys_addr_t memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid); +phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid); + +phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 -extern phys_addr_t memblock_alloc_base(phys_addr_t size, - phys_addr_t align, - phys_addr_t max_addr); -extern phys_addr_t __memblock_alloc_base(phys_addr_t size, - phys_addr_t align, - phys_addr_t max_addr); -extern phys_addr_t memblock_phys_mem_size(void); -extern phys_addr_t memblock_start_of_DRAM(void); -extern phys_addr_t memblock_end_of_DRAM(void); -extern void memblock_enforce_memory_limit(phys_addr_t memory_limit); -extern int memblock_is_memory(phys_addr_t addr); -extern int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); -extern int memblock_is_reserved(phys_addr_t addr); -extern int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); - -extern void memblock_dump_all(void); - -/* Provided by the architecture */ -extern phys_addr_t memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid); -extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, - phys_addr_t addr2, phys_addr_t size2); +phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, + phys_addr_t max_addr); +phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, + phys_addr_t max_addr); +phys_addr_t memblock_phys_mem_size(void); +phys_addr_t memblock_start_of_DRAM(void); +phys_addr_t memblock_end_of_DRAM(void); +void memblock_enforce_memory_limit(phys_addr_t memory_limit); +int memblock_is_memory(phys_addr_t addr); +int memblock_is_region_memory(phys_addr_t base, phys_addr_t size); +int memblock_is_reserved(phys_addr_t addr); +int memblock_is_region_reserved(phys_addr_t base, phys_addr_t size); + +extern void __memblock_dump_all(void); + +static inline void memblock_dump_all(void) +{ + if (memblock_debug) + __memblock_dump_all(); +} /** * memblock_set_current_limit - Set the current allocation limit to allow @@ -101,7 +179,7 @@ extern int memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, * accessible during boot * @limit: New limit value (physical address) */ -extern void memblock_set_current_limit(phys_addr_t limit); +void memblock_set_current_limit(phys_addr_t limit); /* @@ -154,9 +232,9 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region++) -#ifdef ARCH_DISCARD_MEMBLOCK -#define __init_memblock __init -#define __initdata_memblock __initdata +#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK +#define __init_memblock __meminit +#define __initdata_memblock __meminitdata #else #define __init_memblock #define __initdata_memblock @@ -165,7 +243,7 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo #else static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align) { - return MEMBLOCK_ERROR; + return 0; } #endif /* CONFIG_HAVE_MEMBLOCK */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 4baadd18f4a..5d9b4c9813b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1253,41 +1253,34 @@ static inline void pgtable_page_dtor(struct page *page) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * With CONFIG_ARCH_POPULATES_NODE_MAP set, an architecture may initialise its + * With CONFIG_HAVE_MEMBLOCK_NODE_MAP set, an architecture may initialise its * zones, allocate the backing mem_map and account for memory holes in a more * architecture independent manner. This is a substitute for creating the * zone_sizes[] and zholes_size[] arrays and passing them to * free_area_init_node() * * An architecture is expected to register range of page frames backed by - * physical memory with add_active_range() before calling + * physical memory with memblock_add[_node]() before calling * free_area_init_nodes() passing in the PFN each zone ends at. At a basic * usage, an architecture is expected to do something like * * unsigned long max_zone_pfns[MAX_NR_ZONES] = {max_dma, max_normal_pfn, * max_highmem_pfn}; * for_each_valid_physical_page_range() - * add_active_range(node_id, start_pfn, end_pfn) + * memblock_add_node(base, size, nid) * free_area_init_nodes(max_zone_pfns); * - * If the architecture guarantees that there are no holes in the ranges - * registered with add_active_range(), free_bootmem_active_regions() - * will call free_bootmem_node() for each registered physical page range. - * Similarly sparse_memory_present_with_active_regions() calls - * memory_present() for each range when SPARSEMEM is enabled. + * free_bootmem_with_active_regions() calls free_bootmem_node() for each + * registered physical page range. Similarly + * sparse_memory_present_with_active_regions() calls memory_present() for + * each range when SPARSEMEM is enabled. * * See mm/page_alloc.c for more information on each function exposed by - * CONFIG_ARCH_POPULATES_NODE_MAP + * CONFIG_HAVE_MEMBLOCK_NODE_MAP. */ extern void free_area_init_nodes(unsigned long *max_zone_pfn); -extern void add_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn); -extern void remove_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn); -extern void remove_all_active_ranges(void); -void sort_node_map(void); unsigned long node_map_pfn_alignment(void); unsigned long __absent_pages_in_range(int nid, unsigned long start_pfn, unsigned long end_pfn); @@ -1300,14 +1293,11 @@ extern void free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn); int add_from_early_node_map(struct range *range, int az, int nr_range, int nid); -u64 __init find_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit); -typedef int (*work_fn_t)(unsigned long, unsigned long, void *); -extern void work_with_active_regions(int nid, work_fn_t work_fn, void *data); extern void sparse_memory_present_with_active_regions(int nid); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ -#if !defined(CONFIG_ARCH_POPULATES_NODE_MAP) && \ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +#if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \ !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) static inline int __early_pfn_to_nid(unsigned long pfn) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 188cb2ffe8d..3ac040f1936 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -598,13 +598,13 @@ struct zonelist { #endif }; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP struct node_active_region { unsigned long start_pfn; unsigned long end_pfn; int nid; }; -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #ifndef CONFIG_DISCONTIGMEM /* The array of struct pages - for discontigmem use pgdat->lmem_map */ @@ -720,7 +720,7 @@ extern int movable_zone; static inline int zone_movable_is_highmem(void) { -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_ARCH_POPULATES_NODE_MAP) +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) return movable_zone == ZONE_HIGHMEM; #else return 0; @@ -938,7 +938,7 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, #endif #if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \ - !defined(CONFIG_ARCH_POPULATES_NODE_MAP) + !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) static inline unsigned long early_pfn_to_nid(unsigned long pfn) { return 0; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b1f89122bf6..08855613ceb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -54,6 +54,7 @@ enum perf_hw_id { PERF_COUNT_HW_BUS_CYCLES = 6, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, + PERF_COUNT_HW_REF_CPU_CYCLES = 9, PERF_COUNT_HW_MAX, /* non-ABI */ }; @@ -890,6 +891,7 @@ struct perf_event_context { int nr_active; int is_active; int nr_stat; + int nr_freq; int rotate_disable; atomic_t refcount; struct task_struct *task; @@ -1063,12 +1065,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key perf_sched_events; +extern struct jump_label_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events)) + if (static_branch(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1077,7 +1079,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events)) + if (static_branch(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/poison.h b/include/linux/poison.h index 79159de0e34..2110a81c5e2 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -40,12 +40,6 @@ #define RED_INACTIVE 0x09F911029D74E35BULL /* when obj is inactive */ #define RED_ACTIVE 0xD84156C5635688C0ULL /* when obj is active */ -#ifdef CONFIG_PHYS_ADDR_T_64BIT -#define MEMBLOCK_INACTIVE 0x3a84fb0144c9e71bULL -#else -#define MEMBLOCK_INACTIVE 0x44c9e71bUL -#endif - #define SLUB_RED_INACTIVE 0xbb #define SLUB_RED_ACTIVE 0xcc diff --git a/include/linux/sched.h b/include/linux/sched.h index 56fa25a5b1e..40d84481a1c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -273,9 +273,11 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern void select_nohz_load_balancer(int stop_tick); +extern void set_cpu_sd_state_idle(void); extern int get_nohz_timer_target(void); #else static inline void select_nohz_load_balancer(int stop_tick) { } +static inline void set_cpu_sd_state_idle(void) { } #endif /* @@ -483,8 +485,8 @@ struct task_cputime { #define INIT_CPUTIME \ (struct task_cputime) { \ - .utime = cputime_zero, \ - .stime = cputime_zero, \ + .utime = 0, \ + .stime = 0, \ .sum_exec_runtime = 0, \ } @@ -901,6 +903,10 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; }; struct sched_group { @@ -925,6 +931,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + struct sched_domain_attr { int relax_domain_level; }; @@ -1315,8 +1330,8 @@ struct task_struct { * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ - struct task_struct *real_parent; /* real parent process */ - struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */ + struct task_struct __rcu *real_parent; /* real parent process */ + struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */ /* * children/sibling forms the list of my natural children */ diff --git a/include/linux/wait.h b/include/linux/wait.h index 3efc9f3f43a..a9ce45e8501 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,13 +77,13 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); +extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *); #define init_waitqueue_head(q) \ do { \ static struct lock_class_key __key; \ \ - __init_waitqueue_head((q), &__key); \ + __init_waitqueue_head((q), #q, &__key); \ } while (0) #ifdef CONFIG_LOCKDEP diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 959ff18b63b..6ba596b07a7 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -331,6 +331,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_iowait, TP_ARGS(tsk, delay)); /* + * Tracepoint for accounting blocked time (time the task is in uninterruptible). + */ +DEFINE_EVENT(sched_stat_template, sched_stat_blocked, + TP_PROTO(struct task_struct *tsk, u64 delay), + TP_ARGS(tsk, delay)); + +/* * Tracepoint for accounting runtime (time the task is executing * on a CPU). */ @@ -363,6 +370,56 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); +#ifdef CREATE_TRACE_POINTS +static inline u64 trace_get_sleeptime(struct task_struct *tsk) +{ +#ifdef CONFIG_SCHEDSTATS + u64 block, sleep; + + block = tsk->se.statistics.block_start; + sleep = tsk->se.statistics.sleep_start; + tsk->se.statistics.block_start = 0; + tsk->se.statistics.sleep_start = 0; + + return block ? block : sleep ? sleep : 0; +#else + return 0; +#endif +} +#endif + +/* + * Tracepoint for accounting sleeptime (time the task is sleeping + * or waiting for I/O). + */ +TRACE_EVENT(sched_stat_sleeptime, + + TP_PROTO(struct task_struct *tsk, u64 now), + + TP_ARGS(tsk, now), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, sleeptime ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->sleeptime = trace_get_sleeptime(tsk); + __entry->sleeptime = __entry->sleeptime ? + now - __entry->sleeptime : 0; + ) + TP_perf_assign( + __perf_count(__entry->sleeptime); + ), + + TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->sleeptime) +); + /* * Tracepoint for showing priority inheritance modifying a tasks * priority. diff --git a/init/main.c b/init/main.c index 217ed23e948..2c76efb513c 100644 --- a/init/main.c +++ b/init/main.c @@ -469,13 +469,12 @@ asmlinkage void __init start_kernel(void) char * command_line; extern const struct kernel_param __start___param[], __stop___param[]; - smp_setup_processor_id(); - /* * Need to run as early as possible, to initialize the * lockdep hash: */ lockdep_init(); + smp_setup_processor_id(); debug_objects_early_init(); /* diff --git a/kernel/Makefile b/kernel/Makefile index e898c5b9d02..f70396e5a24 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -2,16 +2,15 @@ # Makefile for the linux kernel. # -obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ +obj-y = fork.o exec_domain.o panic.o printk.o \ cpu.o exit.o itimer.o time.o softirq.o resource.o \ sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o sched_clock.o cred.o \ - async.o range.o -obj-y += groups.o + notifier.o ksysfs.o cred.o \ + async.o range.o groups.o ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files @@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg -CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_irq_work.o = -pg endif +obj-y += sched/ + obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o @@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ -obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_CPU_PM) += cpu_pm.o @@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_JUMP_LABEL) += jump_label.o -ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) -# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is -# needed for x86 only. Why this used to be enabled for all architectures is beyond -# me. I suspect most platforms don't need this, but until we know that for sure -# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k -# to get a correct value for the wait-channel (WCHAN in ps). --davidm -CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer -endif - $(obj)/configs.o: $(obj)/config_data.h # config_data.h contains the same information as ikconfig.h but gzipped. diff --git a/kernel/acct.c b/kernel/acct.c index fa7eb3de2dd..203dfead2e0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead) pacct->ac_flag |= ACORE; if (current->flags & PF_SIGNALED) pacct->ac_flag |= AXSIG; - pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); - pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_utime += current->utime; + pacct->ac_stime += current->stime; pacct->ac_minflt += current->min_flt; pacct->ac_majflt += current->maj_flt; spin_unlock_irq(¤t->sighand->siglock); diff --git a/kernel/cpu.c b/kernel/cpu.c index 9d448ddb224..5ca38d5d238 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { if (task_cpu(p) == cpu && p->state == TASK_RUNNING && - (!cputime_eq(p->utime, cputime_zero) || - !cputime_eq(p->stime, cputime_zero))) + (p->utime || p->stime)) printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " "(state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 89e5e8aa4c3..22d901f9caf 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_core.o = -pg endif -obj-y := core.o ring_buffer.o +obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c new file mode 100644 index 00000000000..057e24b665c --- /dev/null +++ b/kernel/events/callchain.c @@ -0,0 +1,191 @@ +/* + * Performance events callchain code, extracted from core.c: + * + * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * For licensing details see kernel-base/COPYING + */ + +#include <linux/perf_event.h> +#include <linux/slab.h> +#include "internal.h" + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +static struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. + */ + size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} diff --git a/kernel/events/core.c b/kernel/events/core.c index fc0e7ff11dd..890eb02c2f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key perf_sched_events __read_mostly; +struct jump_label_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -1325,6 +1327,7 @@ retry: } raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_disable); static void perf_set_shadow_time(struct perf_event *event, struct perf_event_context *ctx, @@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event, if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; + if (event->attr.freq && event->attr.sample_freq) + ctx->nr_freq++; if (event->attr.exclusive) cpuctx->exclusive = 1; @@ -1662,8 +1667,7 @@ retry: * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. */ -static void __perf_event_mark_enabled(struct perf_event *event, - struct perf_event_context *ctx) +static void __perf_event_mark_enabled(struct perf_event *event) { struct perf_event *sub; u64 tstamp = perf_event_time(event); @@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info) */ perf_cgroup_set_timestamp(current, ctx); - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); if (!event_filter_match(event)) { if (is_cgroup_event(event)) @@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event) retry: if (!ctx->is_active) { - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); goto out; } @@ -1809,6 +1813,7 @@ retry: out: raw_spin_unlock_irq(&ctx->lock); } +EXPORT_SYMBOL_GPL(perf_event_enable); int perf_event_refresh(struct perf_event *event, int refresh) { @@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) u64 interrupts, now; s64 delta; + if (!ctx->nr_freq) + return; + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) continue; @@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) { u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0, remove = 1; + int rotate = 0, remove = 1, freq = 0; if (cpuctx->ctx.nr_events) { remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; + if (cpuctx->ctx.nr_freq) + freq = 1; } ctx = cpuctx->task_ctx; @@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; + if (ctx->nr_freq) + freq = 1; } + if (!rotate && !freq) + goto done; + perf_ctx_lock(cpuctx, cpuctx->task_ctx); perf_pmu_disable(cpuctx->ctx.pmu); - perf_ctx_adjust_freq(&cpuctx->ctx, interval); - if (ctx) - perf_ctx_adjust_freq(ctx, interval); - if (!rotate) - goto done; + if (freq) { + perf_ctx_adjust_freq(&cpuctx->ctx, interval); + if (ctx) + perf_ctx_adjust_freq(ctx, interval); + } - cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - if (ctx) - ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); + if (rotate) { + cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); - perf_event_sched_in(cpuctx, ctx, current); + perf_event_sched_in(cpuctx, ctx, current); + } + + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); done: if (remove) list_del_init(&cpuctx->rotation_list); - - perf_pmu_enable(cpuctx->ctx.pmu); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } void perf_event_task_tick(void) @@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event, if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; - __perf_event_mark_enabled(event, ctx); + __perf_event_mark_enabled(event); return 1; } @@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx) raw_spin_lock(&ctx->lock); task_ctx_sched_out(ctx); - list_for_each_entry(event, &ctx->pinned_groups, group_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } - - list_for_each_entry(event, &ctx->flexible_groups, group_entry) { + list_for_each_entry(event, &ctx->event_list, event_entry) { ret = event_enable_on_exec(event, ctx); if (ret) enabled = 1; @@ -2574,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event) } /* - * Callchain support - */ - -struct callchain_cpus_entries { - struct rcu_head rcu_head; - struct perf_callchain_entry *cpu_entries[0]; -}; - -static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); -static atomic_t nr_callchain_events; -static DEFINE_MUTEX(callchain_mutex); -struct callchain_cpus_entries *callchain_cpus_entries; - - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static void release_callchain_buffers_rcu(struct rcu_head *head) -{ - struct callchain_cpus_entries *entries; - int cpu; - - entries = container_of(head, struct callchain_cpus_entries, rcu_head); - - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - - kfree(entries); -} - -static void release_callchain_buffers(void) -{ - struct callchain_cpus_entries *entries; - - entries = callchain_cpus_entries; - rcu_assign_pointer(callchain_cpus_entries, NULL); - call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); -} - -static int alloc_callchain_buffers(void) -{ - int cpu; - int size; - struct callchain_cpus_entries *entries; - - /* - * We can't use the percpu allocation API for data that can be - * accessed from NMI. Use a temporary manual per cpu allocation - * until that gets sorted out. - */ - size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]); - - entries = kzalloc(size, GFP_KERNEL); - if (!entries) - return -ENOMEM; - - size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; - - for_each_possible_cpu(cpu) { - entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, - cpu_to_node(cpu)); - if (!entries->cpu_entries[cpu]) - goto fail; - } - - rcu_assign_pointer(callchain_cpus_entries, entries); - - return 0; - -fail: - for_each_possible_cpu(cpu) - kfree(entries->cpu_entries[cpu]); - kfree(entries); - - return -ENOMEM; -} - -static int get_callchain_buffers(void) -{ - int err = 0; - int count; - - mutex_lock(&callchain_mutex); - - count = atomic_inc_return(&nr_callchain_events); - if (WARN_ON_ONCE(count < 1)) { - err = -EINVAL; - goto exit; - } - - if (count > 1) { - /* If the allocation failed, give up */ - if (!callchain_cpus_entries) - err = -ENOMEM; - goto exit; - } - - err = alloc_callchain_buffers(); - if (err) - release_callchain_buffers(); -exit: - mutex_unlock(&callchain_mutex); - - return err; -} - -static void put_callchain_buffers(void) -{ - if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { - release_callchain_buffers(); - mutex_unlock(&callchain_mutex); - } -} - -static int get_recursion_context(int *recursion) -{ - int rctx; - - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (recursion[rctx]) - return -1; - - recursion[rctx]++; - barrier(); - - return rctx; -} - -static inline void put_recursion_context(int *recursion, int rctx) -{ - barrier(); - recursion[rctx]--; -} - -static struct perf_callchain_entry *get_callchain_entry(int *rctx) -{ - int cpu; - struct callchain_cpus_entries *entries; - - *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); - if (*rctx == -1) - return NULL; - - entries = rcu_dereference(callchain_cpus_entries); - if (!entries) - return NULL; - - cpu = smp_processor_id(); - - return &entries->cpu_entries[cpu][*rctx]; -} - -static void -put_callchain_entry(int rctx) -{ - put_recursion_context(__get_cpu_var(callchain_recursion), rctx); -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - int rctx; - struct perf_callchain_entry *entry; - - - entry = get_callchain_entry(&rctx); - if (rctx == -1) - return NULL; - - if (!entry) - goto exit_put; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - -exit_put: - put_callchain_entry(rctx); - - return entry; -} - -/* * Initialize the perf_event context in a task_struct: */ static void __perf_event_init_context(struct perf_event_context *ctx) @@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec(&perf_sched_events); + jump_label_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec(&perf_sched_events); + jump_label_dec_deferred(&perf_sched_events); } } @@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, struct hw_perf_event *hwc = &event->hw; int throttle = 0; - data->period = event->hw.last_period; if (!overflow) overflow = perf_swevent_set_period(event); @@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, if (!is_sampling_event(event)) return; + if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { + data->period = nr; + return perf_swevent_overflow(event, 1, data, regs); + } else + data->period = event->hw.last_period; + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) return perf_swevent_overflow(event, 1, data, regs); @@ -5981,7 +5788,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events); + jump_label_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events); + jump_label_inc(&perf_sched_events.key); } /* @@ -7065,6 +6872,9 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); + + /* do not patch jump label more than once per second */ + jump_label_rate_limit(&perf_sched_events, HZ); } static int __init perf_event_sysfs_init(void) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 64568a69937..b0b107f90af 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -1,6 +1,10 @@ #ifndef _KERNEL_EVENTS_INTERNAL_H #define _KERNEL_EVENTS_INTERNAL_H +#include <linux/hardirq.h> + +/* Buffer handling */ + #define RING_BUFFER_WRITABLE 0x01 struct ring_buffer { @@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb) } #endif -static unsigned long perf_data_size(struct ring_buffer *rb) +static inline unsigned long perf_data_size(struct ring_buffer *rb) { return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } @@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle, } while (len); } +/* Callchain handling */ +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int get_callchain_buffers(void); +extern void put_callchain_buffers(void); + +static inline int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + #endif /* _KERNEL_EVENTS_INTERNAL_H */ diff --git a/kernel/exit.c b/kernel/exit.c index e6e01b959a0..d579a459309 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk) * We won't ever get here for the group leader, since it * will have been the last reference on the signal_struct. */ - sig->utime = cputime_add(sig->utime, tsk->utime); - sig->stime = cputime_add(sig->stime, tsk->stime); - sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->utime += tsk->utime; + sig->stime += tsk->stime; + sig->gtime += tsk->gtime; sig->min_flt += tsk->min_flt; sig->maj_flt += tsk->maj_flt; sig->nvcsw += tsk->nvcsw; @@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) spin_lock_irq(&p->real_parent->sighand->siglock); psig = p->real_parent->signal; sig = p->signal; - psig->cutime = - cputime_add(psig->cutime, - cputime_add(tgutime, - sig->cutime)); - psig->cstime = - cputime_add(psig->cstime, - cputime_add(tgstime, - sig->cstime)); - psig->cgtime = - cputime_add(psig->cgtime, - cputime_add(p->gtime, - cputime_add(sig->gtime, - sig->cgtime))); + psig->cutime += tgutime + sig->cutime; + psig->cstime += tgstime + sig->cstime; + psig->cgtime += p->gtime + sig->gtime + sig->cgtime; psig->cmin_flt += p->min_flt + sig->min_flt + sig->cmin_flt; psig->cmaj_flt += diff --git a/kernel/fork.c b/kernel/fork.c index da4a6a10d08..b058c5820ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p) */ static void posix_cpu_timers_init(struct task_struct *tsk) { - tsk->cputime_expires.prof_exp = cputime_zero; - tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.prof_exp = 0; + tsk->cputime_expires.virt_exp = 0; tsk->cputime_expires.sched_exp = 0; INIT_LIST_HEAD(&tsk->cpu_timers[0]); INIT_LIST_HEAD(&tsk->cpu_timers[1]); @@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, init_sigpending(&p->pending); - p->utime = cputime_zero; - p->stime = cputime_zero; - p->gtime = cputime_zero; - p->utimescaled = cputime_zero; - p->stimescaled = cputime_zero; + p->utime = p->stime = p->gtime = 0; + p->utimescaled = p->stimescaled = 0; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - p->prev_utime = cputime_zero; - p->prev_stime = cputime_zero; + p->prev_utime = p->prev_stime = 0; #endif #if defined(SPLIT_RSS_COUNTING) memset(&p->rss_stat, 0, sizeof(p->rss_stat)); diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883153d..22000c3db0d 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; - if (!cputime_eq(cval, cputime_zero)) { + if (cval) { struct task_cputime cputime; cputime_t t; thread_group_cputimer(tsk, &cputime); if (clock_id == CPUCLOCK_PROF) - t = cputime_add(cputime.utime, cputime.stime); + t = cputime.utime + cputime.stime; else /* CPUCLOCK_VIRT */ t = cputime.utime; - if (cputime_le(cval, t)) + if (cval < t) /* about to fire */ cval = cputime_one_jiffy; else - cval = cputime_sub(cval, t); + cval = cval - t; } spin_unlock_irq(&tsk->sighand->siglock); @@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, cval = it->expires; cinterval = it->incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, cputime_one_jiffy); + if (cval || nval) { + if (nval > 0) + nval += cputime_one_jiffy; set_process_cpu_timer(tsk, clock_id, &nval, &cval); } it->expires = nval; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 66ff7109f69..30c3c770813 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key) jump_label_unlock(); } -void jump_label_dec(struct jump_label_key *key) +static void __jump_label_dec(struct jump_label_key *key, + unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) return; - jump_label_update(key, JUMP_LABEL_DISABLE); + if (rate_limit) { + atomic_inc(&key->enabled); + schedule_delayed_work(work, rate_limit); + } else + jump_label_update(key, JUMP_LABEL_DISABLE); + jump_label_unlock(); } +static void jump_label_update_timeout(struct work_struct *work) +{ + struct jump_label_key_deferred *key = + container_of(work, struct jump_label_key_deferred, work.work); + __jump_label_dec(&key->key, 0, NULL); +} + +void jump_label_dec(struct jump_label_key *key) +{ + __jump_label_dec(key, 0, NULL); +} + +void jump_label_dec_deferred(struct jump_label_key_deferred *key) +{ + __jump_label_dec(&key->key, key->timeout, &key->work); +} + + +void jump_label_rate_limit(struct jump_label_key_deferred *key, + unsigned long rl) +{ + key->timeout = rl; + INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); +} + static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (entry->code <= (unsigned long)end && @@ -111,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start, * running code can override this to make the non-live update case * cheaper. */ -void __weak arch_jump_label_transform_static(struct jump_entry *entry, +void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { arch_jump_label_transform(entry, type); @@ -217,8 +248,13 @@ void jump_label_apply_nops(struct module *mod) if (iter_start == iter_stop) return; - for (iter = iter_start; iter < iter_stop; iter++) - arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); + for (iter = iter_start; iter < iter_stop; iter++) { + struct jump_label_key *iterk; + + iterk = (struct jump_label_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? + JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + } } static int jump_label_add_module(struct module *mod) @@ -258,8 +294,7 @@ static int jump_label_add_module(struct module *mod) key->next = jlm; if (jump_label_enabled(key)) - __jump_label_update(key, iter, iter_stop, - JUMP_LABEL_ENABLE); + __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } return 0; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f45c6817770..8889f7dd7c4 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -431,6 +431,7 @@ unsigned int max_lockdep_depth; * about it later on, in lockdep_info(). */ static int lockdep_init_error; +static const char *lock_init_error; static unsigned long lockdep_init_trace_data[20]; static struct stack_trace lockdep_init_trace = { .max_entries = ARRAY_SIZE(lockdep_init_trace_data), @@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) usage[i] = '\0'; } -static int __print_lock_name(struct lock_class *class) +static void __print_lock_name(struct lock_class *class) { char str[KSYM_NAME_LEN]; const char *name; name = class->name; - if (!name) - name = __get_key_name(class->key, str); - - return printk("%s", name); -} - -static void print_lock_name(struct lock_class *class) -{ - char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; - const char *name; - - get_usage_chars(class, usage); - - name = class->name; if (!name) { name = __get_key_name(class->key, str); - printk(" (%s", name); + printk("%s", name); } else { - printk(" (%s", name); + printk("%s", name); if (class->name_version > 1) printk("#%d", class->name_version); if (class->subclass) printk("/%d", class->subclass); } +} + +static void print_lock_name(struct lock_class *class) +{ + char usage[LOCK_USAGE_CHARS]; + + get_usage_chars(class, usage); + + printk(" ("); + __print_lock_name(class); printk("){%s}", usage); } @@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr) } } -static void print_kernel_version(void) +static void print_kernel_ident(void) { - printk("%s %.*s\n", init_utsname()->release, + printk("%s %.*s %s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, + print_tainted()); } static int very_verbose(struct lock_class *class) @@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (unlikely(!lockdep_initialized)) { lockdep_init(); lockdep_init_error = 1; + lock_init_error = lock->name; save_stack_trace(&lockdep_init_trace); } #endif @@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class = look_up_lock_class(lock, subclass); if (likely(class)) - return class; + goto out_set_class_cache; /* * Debug-check: all keys must be persistent! @@ -808,6 +807,7 @@ out_unlock_set: graph_unlock(); raw_local_irq_restore(flags); +out_set_class_cache: if (!subclass || force) lock->class_cache[0] = class; else if (subclass < NR_LOCKDEP_CACHING_CLASSES) @@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, printk("\n"); printk("======================================================\n"); printk("[ INFO: possible circular locking dependency detected ]\n"); - print_kernel_version(); + print_kernel_ident(); printk("-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); @@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr, printk("======================================================\n"); printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); - print_kernel_version(); + print_kernel_ident(); printk("------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), @@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, printk("\n"); printk("=============================================\n"); printk("[ INFO: possible recursive locking detected ]\n"); - print_kernel_version(); + print_kernel_ident(); printk("---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); @@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, printk("\n"); printk("=================================\n"); printk("[ INFO: inconsistent lock state ]\n"); - print_kernel_version(); + print_kernel_ident(); printk("---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", @@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr, printk("\n"); printk("=========================================================\n"); printk("[ INFO: possible irq lock inversion dependency detected ]\n"); - print_kernel_version(); + print_kernel_ident(); printk("---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); @@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, printk("\n"); printk("=====================================\n"); printk("[ BUG: bad unlock balance detected! ]\n"); + print_kernel_ident(); printk("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); @@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, printk("\n"); printk("=================================\n"); printk("[ BUG: bad contention detected! ]\n"); + print_kernel_ident(); printk("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); @@ -3974,7 +3976,8 @@ void __init lockdep_info(void) #ifdef CONFIG_DEBUG_LOCKDEP if (lockdep_init_error) { - printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); + printk("WARNING: lockdep init error! lock-%s was acquired" + "before lockdep_init\n", lock_init_error); printk("Call stack leading to lockdep invocation was:\n"); print_stack_trace(&lockdep_init_trace, 0); } @@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, printk("\n"); printk("=========================\n"); printk("[ BUG: held lock freed! ]\n"); + print_kernel_ident(); printk("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); @@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr) printk("\n"); printk("=====================================\n"); printk("[ BUG: lock held at task exit time! ]\n"); + print_kernel_ident(); printk("-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", curr->comm, task_pid_nr(curr)); @@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void) printk("\n"); printk("================================================\n"); printk("[ BUG: lock held when returning to user space! ]\n"); + print_kernel_ident(); printk("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); @@ -4166,6 +4172,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("\n"); printk("===============================\n"); printk("[ INFO: suspicious RCU usage. ]\n"); + print_kernel_ident(); printk("-------------------------------\n"); printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); diff --git a/kernel/panic.c b/kernel/panic.c index b2659360421..3458469eb7c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -237,11 +237,20 @@ void add_taint(unsigned flag) * Can't trust the integrity of the kernel anymore. * We don't call directly debug_locks_off() because the issue * is not necessarily serious enough to set oops_in_progress to 1 - * Also we want to keep up lockdep for staging development and - * post-warning case. + * Also we want to keep up lockdep for staging/out-of-tree + * development and post-warning case. */ - if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) - printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); + switch (flag) { + case TAINT_CRAP: + case TAINT_OOT_MODULE: + case TAINT_WARN: + case TAINT_FIRMWARE_WORKAROUND: + break; + + default: + if (__debug_locks_off()) + printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); + } set_bit(flag, &tainted_mask); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e7cb76dc18f..125cb67daa2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { return now.sched < then.sched; } else { - return cputime_lt(now.cpu, then.cpu); + return now.cpu < then.cpu; } } static inline void cpu_time_add(const clockid_t which_clock, @@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { acc->sched += val.sched; } else { - acc->cpu = cputime_add(acc->cpu, val.cpu); + acc->cpu += val.cpu; } } static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, @@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { a.sched -= b.sched; } else { - a.cpu = cputime_sub(a.cpu, b.cpu); + a.cpu -= b.cpu; } return a; } /* - * Divide and limit the result to res >= 1 - * - * This is necessary to prevent signal delivery starvation, when the result of - * the division would be rounded down to 0. - */ -static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) -{ - cputime_t res = cputime_div(time, div); - - return max_t(cputime_t, res, 1); -} - -/* * Update expiry time from increment, and increase overrun count, * given the current clock sample. */ @@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer, } else { cputime_t delta, incr; - if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) + if (now.cpu < timer->it.cpu.expires.cpu) return; incr = timer->it.cpu.incr.cpu; - delta = cputime_sub(cputime_add(now.cpu, incr), - timer->it.cpu.expires.cpu); + delta = now.cpu + incr - timer->it.cpu.expires.cpu; /* Don't use (incr*2 < delta), incr*2 might overflow. */ - for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) - incr = cputime_add(incr, incr); - for (; i >= 0; incr = cputime_halve(incr), i--) { - if (cputime_lt(delta, incr)) + for (i = 0; incr < delta - incr; i++) + incr += incr; + for (; i >= 0; incr = incr >> 1, i--) { + if (delta < incr) continue; - timer->it.cpu.expires.cpu = - cputime_add(timer->it.cpu.expires.cpu, incr); + timer->it.cpu.expires.cpu += incr; timer->it_overrun += 1 << i; - delta = cputime_sub(delta, incr); + delta -= incr; } } } static inline cputime_t prof_ticks(struct task_struct *p) { - return cputime_add(p->utime, p->stime); + return p->utime + p->stime; } static inline cputime_t virt_ticks(struct task_struct *p) { @@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - times->utime = cputime_add(times->utime, t->utime); - times->stime = cputime_add(times->stime, t->stime); + times->utime += t->utime; + times->stime += t->stime; times->sum_exec_runtime += task_sched_runtime(t); } while_each_thread(tsk, t); out: @@ -258,10 +243,10 @@ out: static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) { - if (cputime_gt(b->utime, a->utime)) + if (b->utime > a->utime) a->utime = b->utime; - if (cputime_gt(b->stime, a->stime)) + if (b->stime > a->stime) a->stime = b->stime; if (b->sum_exec_runtime > a->sum_exec_runtime) @@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return -EINVAL; case CPUCLOCK_PROF: thread_group_cputime(p, &cputime); - cpu->cpu = cputime_add(cputime.utime, cputime.stime); + cpu->cpu = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: thread_group_cputime(p, &cputime); @@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head, unsigned long long sum_exec_runtime) { struct cpu_timer_list *timer, *next; - cputime_t ptime = cputime_add(utime, stime); + cputime_t ptime = utime + stime; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (cputime_lt(timer->expires.cpu, ptime)) { - timer->expires.cpu = cputime_zero; + if (timer->expires.cpu < ptime) { + timer->expires.cpu = 0; } else { - timer->expires.cpu = cputime_sub(timer->expires.cpu, - ptime); + timer->expires.cpu -= ptime; } } ++head; list_for_each_entry_safe(timer, next, head, entry) { list_del_init(&timer->entry); - if (cputime_lt(timer->expires.cpu, utime)) { - timer->expires.cpu = cputime_zero; + if (timer->expires.cpu < utime) { + timer->expires.cpu = 0; } else { - timer->expires.cpu = cputime_sub(timer->expires.cpu, - utime); + timer->expires.cpu -= utime; } } @@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) struct signal_struct *const sig = tsk->signal; cleanup_timers(tsk->signal->cpu_timers, - cputime_add(tsk->utime, sig->utime), - cputime_add(tsk->stime, sig->stime), + tsk->utime + sig->utime, tsk->stime + sig->stime, tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } @@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) static inline int expires_gt(cputime_t expires, cputime_t new_exp) { - return cputime_eq(expires, cputime_zero) || - cputime_gt(expires, new_exp); + return expires == 0 || expires > new_exp; } /* @@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, default: return -EINVAL; case CPUCLOCK_PROF: - cpu->cpu = cputime_add(cputime.utime, cputime.stime); + cpu->cpu = cputime.utime + cputime.stime; break; case CPUCLOCK_VIRT: cpu->cpu = cputime.utime; @@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk, unsigned long soft; maxfire = 20; - tsk->cputime_expires.prof_exp = cputime_zero; + tsk->cputime_expires.prof_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) { tsk->cputime_expires.prof_exp = t->expires.cpu; break; } @@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk, ++timers; maxfire = 20; - tsk->cputime_expires.virt_exp = cputime_zero; + tsk->cputime_expires.virt_exp = 0; while (!list_empty(timers)) { struct cpu_timer_list *t = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) { tsk->cputime_expires.virt_exp = t->expires.cpu; break; } @@ -1009,20 +990,19 @@ static u32 onecputick; static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, cputime_t *expires, cputime_t cur_time, int signo) { - if (cputime_eq(it->expires, cputime_zero)) + if (!it->expires) return; - if (cputime_ge(cur_time, it->expires)) { - if (!cputime_eq(it->incr, cputime_zero)) { - it->expires = cputime_add(it->expires, it->incr); + if (cur_time >= it->expires) { + if (it->incr) { + it->expires += it->incr; it->error += it->incr_error; if (it->error >= onecputick) { - it->expires = cputime_sub(it->expires, - cputime_one_jiffy); + it->expires -= cputime_one_jiffy; it->error -= onecputick; } } else { - it->expires = cputime_zero; + it->expires = 0; } trace_itimer_expire(signo == SIGPROF ? @@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); } - if (!cputime_eq(it->expires, cputime_zero) && - (cputime_eq(*expires, cputime_zero) || - cputime_lt(it->expires, *expires))) { + if (it->expires && (!*expires || it->expires < *expires)) { *expires = it->expires; } } @@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, */ static inline int task_cputime_zero(const struct task_cputime *cputime) { - if (cputime_eq(cputime->utime, cputime_zero) && - cputime_eq(cputime->stime, cputime_zero) && - cputime->sum_exec_runtime == 0) + if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime) return 1; return 0; } @@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk, */ thread_group_cputimer(tsk, &cputime); utime = cputime.utime; - ptime = cputime_add(utime, cputime.stime); + ptime = utime + cputime.stime; sum_sched_runtime = cputime.sum_exec_runtime; maxfire = 20; - prof_expires = cputime_zero; + prof_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { + if (!--maxfire || ptime < tl->expires.cpu) { prof_expires = tl->expires.cpu; break; } @@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk, ++timers; maxfire = 20; - virt_expires = cputime_zero; + virt_expires = 0; while (!list_empty(timers)) { struct cpu_timer_list *tl = list_first_entry(timers, struct cpu_timer_list, entry); - if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { + if (!--maxfire || utime < tl->expires.cpu) { virt_expires = tl->expires.cpu; break; } @@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk, } } x = secs_to_cputime(soft); - if (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(x, prof_expires)) { + if (!prof_expires || x < prof_expires) { prof_expires = x; } } @@ -1249,12 +1224,9 @@ out: static inline int task_cputime_expired(const struct task_cputime *sample, const struct task_cputime *expires) { - if (!cputime_eq(expires->utime, cputime_zero) && - cputime_ge(sample->utime, expires->utime)) + if (expires->utime && sample->utime >= expires->utime) return 1; - if (!cputime_eq(expires->stime, cputime_zero) && - cputime_ge(cputime_add(sample->utime, sample->stime), - expires->stime)) + if (expires->stime && sample->utime + sample->stime >= expires->stime) return 1; if (expires->sum_exec_runtime != 0 && sample->sum_exec_runtime >= expires->sum_exec_runtime) @@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, * it to be relative, *newval argument is relative and we update * it to be absolute. */ - if (!cputime_eq(*oldval, cputime_zero)) { - if (cputime_le(*oldval, now.cpu)) { + if (*oldval) { + if (*oldval <= now.cpu) { /* Just about to fire. */ *oldval = cputime_one_jiffy; } else { - *oldval = cputime_sub(*oldval, now.cpu); + *oldval -= now.cpu; } } - if (cputime_eq(*newval, cputime_zero)) + if (!*newval) return; - *newval = cputime_add(*newval, now.cpu); + *newval += now.cpu; } /* diff --git a/kernel/printk.c b/kernel/printk.c index 7982a0a841e..989e4a52da7 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -199,7 +199,7 @@ void __init setup_log_buf(int early) unsigned long mem; mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); - if (mem == MEMBLOCK_ERROR) + if (!mem) return; new_log_buf = __va(mem); } else { @@ -688,6 +688,7 @@ static void zap_locks(void) oops_timestamp = jiffies; + debug_locks_off(); /* If a crash is occurring, make sure we can't deadlock */ raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ @@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) boot_delay_msec(); printk_delay(); - preempt_disable(); /* This stops the holder of console_sem just where we want him */ - raw_local_irq_save(flags); + local_irq_save(flags); this_cpu = smp_processor_id(); /* @@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * recursion and return - but flag the recursion so that * it can be printed at the next appropriate moment: */ - if (!oops_in_progress) { + if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; goto out_restore_irqs; } @@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) lockdep_on(); out_restore_irqs: - raw_local_irq_restore(flags); + local_irq_restore(flags); - preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 8eafd1bd273..16502d3a71c 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk("%s\n", print_tainted()); printk( "--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile new file mode 100644 index 00000000000..9a7dd35102a --- /dev/null +++ b/kernel/sched/Makefile @@ -0,0 +1,20 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_clock.o = -pg +endif + +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is +# needed for x86 only. Why this used to be enabled for all architectures is beyond +# me. I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer +endif + +obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o +obj-$(CONFIG_SMP) += cpupri.o +obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o +obj-$(CONFIG_SCHEDSTATS) += stats.o +obj-$(CONFIG_SCHED_DEBUG) += debug.o + + diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c index 429242f3c48..e8a1f83ee0e 100644 --- a/kernel/sched_autogroup.c +++ b/kernel/sched/auto_group.c @@ -1,15 +1,19 @@ #ifdef CONFIG_SCHED_AUTOGROUP +#include "sched.h" + #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kallsyms.h> #include <linux/utsname.h> +#include <linux/security.h> +#include <linux/export.h> unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; -static void __init autogroup_init(struct task_struct *init_task) +void __init autogroup_init(struct task_struct *init_task) { autogroup_default.tg = &root_task_group; kref_init(&autogroup_default.kref); @@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task) init_task->signal->autogroup = &autogroup_default; } -static inline void autogroup_free(struct task_group *tg) +void autogroup_free(struct task_group *tg) { kfree(tg->autogroup); } @@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p) return ag; } -#ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg); -#endif - static inline struct autogroup *autogroup_create(void) { struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); @@ -108,8 +108,7 @@ out_fail: return autogroup_kref_get(&autogroup_default); } -static inline bool -task_wants_autogroup(struct task_struct *p, struct task_group *tg) +bool task_wants_autogroup(struct task_struct *p, struct task_group *tg) { if (tg != &root_task_group) return false; @@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg) return true; } -static inline bool task_group_is_autogroup(struct task_group *tg) -{ - return !!tg->autogroup; -} - -static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg) -{ - int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); - - if (enabled && task_wants_autogroup(p, tg)) - return p->signal->autogroup->tg; - - return tg; -} - static void autogroup_move_group(struct task_struct *p, struct autogroup *ag) { @@ -263,7 +246,7 @@ out: #endif /* CONFIG_PROC_FS */ #ifdef CONFIG_SCHED_DEBUG -static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +int autogroup_path(struct task_group *tg, char *buf, int buflen) { if (!task_group_is_autogroup(tg)) return 0; diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h index c2f0e7248dc..8bd04714281 100644 --- a/kernel/sched_autogroup.h +++ b/kernel/sched/auto_group.h @@ -1,5 +1,8 @@ #ifdef CONFIG_SCHED_AUTOGROUP +#include <linux/kref.h> +#include <linux/rwsem.h> + struct autogroup { /* * reference doesn't mean how many thread attach to this @@ -13,9 +16,28 @@ struct autogroup { int nice; }; -static inline bool task_group_is_autogroup(struct task_group *tg); +extern void autogroup_init(struct task_struct *init_task); +extern void autogroup_free(struct task_group *tg); + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg); + static inline struct task_group * -autogroup_task_group(struct task_struct *p, struct task_group *tg); +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +extern int autogroup_path(struct task_group *tg, char *buf, int buflen); #else /* !CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c index c685e31492d..c685e31492d 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched/clock.c diff --git a/kernel/sched.c b/kernel/sched/core.c index d6b149ccf92..4dbfd04a214 100644 --- a/kernel/sched.c +++ b/kernel/sched/core.c @@ -1,5 +1,5 @@ /* - * kernel/sched.c + * kernel/sched/core.c * * Kernel scheduler and related syscalls * @@ -56,7 +56,6 @@ #include <linux/percpu.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/stop_machine.h> #include <linux/sysctl.h> #include <linux/syscalls.h> #include <linux/times.h> @@ -75,129 +74,17 @@ #include <asm/tlb.h> #include <asm/irq_regs.h> -#include <asm/mutex.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #endif -#include "sched_cpupri.h" -#include "workqueue_sched.h" -#include "sched_autogroup.h" +#include "sched.h" +#include "../workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -/* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], - * and back. - */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) -#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) - -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - -/* - * Helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) - -#define NICE_0_LOAD SCHED_LOAD_SCALE -#define NICE_0_SHIFT SCHED_LOAD_SHIFT - -/* - * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. - */ -#define DEF_TIMESLICE (100 * HZ / 1000) - -/* - * single value that denotes runtime == period, ie unlimited time. - */ -#define RUNTIME_INF ((u64)~0ULL) - -static inline int rt_policy(int policy) -{ - if (policy == SCHED_FIFO || policy == SCHED_RR) - return 1; - return 0; -} - -static inline int task_has_rt_policy(struct task_struct *p) -{ - return rt_policy(p->policy); -} - -/* - * This is the priority-queue data structure of the RT scheduling class: - */ -struct rt_prio_array { - DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ - struct list_head queue[MAX_RT_PRIO]; -}; - -struct rt_bandwidth { - /* nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - ktime_t rt_period; - u64 rt_runtime; - struct hrtimer rt_period_timer; -}; - -static struct rt_bandwidth def_rt_bandwidth; - -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_bandwidth *rt_b = - container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - - if (!overrun) - break; - - idle = do_sched_rt_period_timer(rt_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static -void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) -{ - rt_b->rt_period = ns_to_ktime(period); - rt_b->rt_runtime = runtime; - - raw_spin_lock_init(&rt_b->rt_runtime_lock); - - hrtimer_init(&rt_b->rt_period_timer, - CLOCK_MONOTONIC, HRTIMER_MODE_REL); - rt_b->rt_period_timer.function = sched_rt_period_timer; -} - -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - -static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) +void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) { unsigned long delta; ktime_t soft, hard, now; @@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) } } -static void start_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return; - - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); - raw_spin_unlock(&rt_b->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED -static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) -{ - hrtimer_cancel(&rt_b->rt_period_timer); -} -#endif - -/* - * sched_domains_mutex serializes calls to init_sched_domains, - * detach_destroy_domains and partition_sched_domains. - */ -static DEFINE_MUTEX(sched_domains_mutex); - -#ifdef CONFIG_CGROUP_SCHED - -#include <linux/cgroup.h> - -struct cfs_rq; - -static LIST_HEAD(task_groups); - -struct cfs_bandwidth { -#ifdef CONFIG_CFS_BANDWIDTH - raw_spinlock_t lock; - ktime_t period; - u64 quota, runtime; - s64 hierarchal_quota; - u64 runtime_expires; - - int idle, timer_active; - struct hrtimer period_timer, slack_timer; - struct list_head throttled_cfs_rq; - - /* statistics */ - int nr_periods, nr_throttled; - u64 throttled_time; -#endif -}; - -/* task group related information */ -struct task_group { - struct cgroup_subsys_state css; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* schedulable entities of this group on each cpu */ - struct sched_entity **se; - /* runqueue "owned" by this group on each cpu */ - struct cfs_rq **cfs_rq; - unsigned long shares; - - atomic_t load_weight; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - struct sched_rt_entity **rt_se; - struct rt_rq **rt_rq; - - struct rt_bandwidth rt_bandwidth; -#endif - - struct rcu_head rcu; - struct list_head list; - - struct task_group *parent; - struct list_head siblings; - struct list_head children; - -#ifdef CONFIG_SCHED_AUTOGROUP - struct autogroup *autogroup; -#endif - - struct cfs_bandwidth cfs_bandwidth; -}; - -/* task_group_lock serializes the addition/removal of task groups */ -static DEFINE_SPINLOCK(task_group_lock); - -#ifdef CONFIG_FAIR_GROUP_SCHED - -# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD - -/* - * A weight of 0 or 1 can cause arithmetics problems. - * A weight of a cfs_rq is the sum of weights of which entities - * are queued on this cfs_rq, so a weight of a entity should not be - * too large, so as the shares value of a task group. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ -#define MIN_SHARES (1UL << 1) -#define MAX_SHARES (1UL << 18) - -static int root_task_group_load = ROOT_TASK_GROUP_LOAD; -#endif - -/* Default task group. - * Every task in system belong to this group at bootup. - */ -struct task_group root_task_group; - -#endif /* CONFIG_CGROUP_SCHED */ - -/* CFS-related fields in a runqueue */ -struct cfs_rq { - struct load_weight load; - unsigned long nr_running, h_nr_running; - - u64 exec_clock; - u64 min_vruntime; -#ifndef CONFIG_64BIT - u64 min_vruntime_copy; -#endif - - struct rb_root tasks_timeline; - struct rb_node *rb_leftmost; - - struct list_head tasks; - struct list_head *balance_iterator; - - /* - * 'curr' points to currently running entity on this cfs_rq. - * It is set to NULL otherwise (i.e when none are currently running). - */ - struct sched_entity *curr, *next, *last, *skip; - -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - -#ifdef CONFIG_FAIR_GROUP_SCHED - struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ - - /* - * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in - * a hierarchy). Non-leaf lrqs hold other higher schedulable entities - * (like users, containers etc.) - * - * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This - * list is used during load balance. - */ - int on_list; - struct list_head leaf_cfs_rq_list; - struct task_group *tg; /* group that "owns" this runqueue */ - -#ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - - /* - * h_load = weight * f(tg) - * - * Where f(tg) is the recursive weight fraction assigned to - * this group. - */ - unsigned long h_load; - - /* - * Maintaining per-cpu shares distribution for group scheduling - * - * load_stamp is the last time we updated the load average - * load_last is the last time we updated the load average and saw load - * load_unacc_exec_time is currently unaccounted execution time - */ - u64 load_avg; - u64 load_period; - u64 load_stamp, load_last, load_unacc_exec_time; - - unsigned long load_contribution; -#endif -#ifdef CONFIG_CFS_BANDWIDTH - int runtime_enabled; - u64 runtime_expires; - s64 runtime_remaining; - - u64 throttled_timestamp; - int throttled, throttle_count; - struct list_head throttled_list; -#endif -#endif -}; - -#ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_CFS_BANDWIDTH -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return &tg->cfs_bandwidth; -} - -static inline u64 default_cfs_period(void); -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); -static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); - -static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, slack_timer); - do_sched_cfs_slack_timer(cfs_b); - - return HRTIMER_NORESTART; -} - -static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) -{ - struct cfs_bandwidth *cfs_b = - container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; - int overrun; - int idle = 0; - - for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - - if (!overrun) - break; - - idle = do_sched_cfs_period_timer(cfs_b, overrun); - } - - return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; -} - -static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - raw_spin_lock_init(&cfs_b->lock); - cfs_b->runtime = 0; - cfs_b->quota = RUNTIME_INF; - cfs_b->period = ns_to_ktime(default_cfs_period()); - - INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->period_timer.function = sched_cfs_period_timer; - hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cfs_b->slack_timer.function = sched_cfs_slack_timer; -} - -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) -{ - cfs_rq->runtime_enabled = 0; - INIT_LIST_HEAD(&cfs_rq->throttled_list); -} - -/* requires cfs_b->lock, may release to reprogram timer */ -static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer))) { - raw_spin_unlock(&cfs_b->lock); - /* ensure cfs_b->lock is available while we wait */ - hrtimer_cancel(&cfs_b->period_timer); - - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (cfs_b->timer_active) - return; - } - - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); -} - -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) -{ - hrtimer_cancel(&cfs_b->period_timer); - hrtimer_cancel(&cfs_b->slack_timer); -} -#else -static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} -static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} -static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} - -static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) -{ - return NULL; -} -#endif /* CONFIG_CFS_BANDWIDTH */ -#endif /* CONFIG_FAIR_GROUP_SCHED */ - -/* Real-Time classes' related field in a runqueue: */ -struct rt_rq { - struct rt_prio_array active; - unsigned long rt_nr_running; -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED - struct { - int curr; /* highest queued rt task prio */ -#ifdef CONFIG_SMP - int next; /* next highest */ -#endif - } highest_prio; -#endif -#ifdef CONFIG_SMP - unsigned long rt_nr_migratory; - unsigned long rt_nr_total; - int overloaded; - struct plist_head pushable_tasks; -#endif - int rt_throttled; - u64 rt_time; - u64 rt_runtime; - /* Nests inside the rq lock: */ - raw_spinlock_t rt_runtime_lock; - -#ifdef CONFIG_RT_GROUP_SCHED - unsigned long rt_nr_boosted; - - struct rq *rq; - struct list_head leaf_rt_rq_list; - struct task_group *tg; -#endif -}; - -#ifdef CONFIG_SMP - -/* - * We add the notion of a root-domain which will be used to define per-domain - * variables. Each exclusive cpuset essentially defines an island domain by - * fully partitioning the member cpus from any other cpuset. Whenever a new - * exclusive cpuset is created, we also create and attach a new root-domain - * object. - * - */ -struct root_domain { - atomic_t refcount; - atomic_t rto_count; - struct rcu_head rcu; - cpumask_var_t span; - cpumask_var_t online; - - /* - * The "RT overload" flag: it gets set if a CPU has more than - * one runnable RT task. - */ - cpumask_var_t rto_mask; - struct cpupri cpupri; -}; - -/* - * By default the system creates a single root-domain with all cpus as - * members (mimicking the global state we have today). - */ -static struct root_domain def_root_domain; - -#endif /* CONFIG_SMP */ - -/* - * This is the main, per-CPU runqueue data structure. - * - * Locking rule: those places that want to lock multiple runqueues - * (such as the load balancing or the thread migration code), lock - * acquire operations must be ordered by ascending &runqueue. - */ -struct rq { - /* runqueue lock: */ - raw_spinlock_t lock; - - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ - unsigned long nr_running; - #define CPU_LOAD_IDX_MAX 5 - unsigned long cpu_load[CPU_LOAD_IDX_MAX]; - unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ - u64 nohz_stamp; - unsigned char nohz_balance_kick; -#endif - int skip_clock_update; - - /* capture load from *all* tasks on this cpu: */ - struct load_weight load; - unsigned long nr_load_updates; - u64 nr_switches; - - struct cfs_rq cfs; - struct rt_rq rt; - -#ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this cpu: */ - struct list_head leaf_cfs_rq_list; -#endif -#ifdef CONFIG_RT_GROUP_SCHED - struct list_head leaf_rt_rq_list; -#endif - - /* - * This is part of a global counter where only the total sum - * over all CPUs matters. A task can increase this counter on - * one CPU and if it got migrated afterwards it may decrease - * it on another CPU. Always updated under the runqueue lock: - */ - unsigned long nr_uninterruptible; - - struct task_struct *curr, *idle, *stop; - unsigned long next_balance; - struct mm_struct *prev_mm; - - u64 clock; - u64 clock_task; - - atomic_t nr_iowait; - -#ifdef CONFIG_SMP - struct root_domain *rd; - struct sched_domain *sd; - - unsigned long cpu_power; - - unsigned char idle_balance; - /* For active balancing */ - int post_schedule; - int active_balance; - int push_cpu; - struct cpu_stop_work active_balance_work; - /* cpu of this runqueue: */ - int cpu; - int online; - - u64 rt_avg; - u64 age_stamp; - u64 idle_stamp; - u64 avg_idle; -#endif - -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - u64 prev_irq_time; -#endif -#ifdef CONFIG_PARAVIRT - u64 prev_steal_time; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - u64 prev_steal_time_rq; -#endif - - /* calc_load related fields */ - unsigned long calc_load_update; - long calc_load_active; - -#ifdef CONFIG_SCHED_HRTICK -#ifdef CONFIG_SMP - int hrtick_csd_pending; - struct call_single_data hrtick_csd; -#endif - struct hrtimer hrtick_timer; -#endif - -#ifdef CONFIG_SCHEDSTATS - /* latency stats */ - struct sched_info rq_sched_info; - unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ - - /* sys_sched_yield() stats */ - unsigned int yld_count; - - /* schedule() stats */ - unsigned int sched_switch; - unsigned int sched_count; - unsigned int sched_goidle; - - /* try_to_wake_up() stats */ - unsigned int ttwu_count; - unsigned int ttwu_local; -#endif - -#ifdef CONFIG_SMP - struct llist_head wake_list; -#endif -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); - - -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - -static inline int cpu_of(struct rq *rq) -{ -#ifdef CONFIG_SMP - return rq->cpu; -#else - return 0; -#endif -} - -#define rcu_dereference_check_sched_domain(p) \ - rcu_dereference_check((p), \ - lockdep_is_held(&sched_domains_mutex)) - -/* - * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See detach_destroy_domains: synchronize_sched for details. - * - * The domain tree of any CPU may only be accessed from within - * preempt-disabled sections. - */ -#define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) - -#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -#define this_rq() (&__get_cpu_var(runqueues)) -#define task_rq(p) cpu_rq(task_cpu(p)) -#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -#define raw_rq() (&__raw_get_cpu_var(runqueues)) - -#ifdef CONFIG_CGROUP_SCHED - -/* - * Return the group to which this tasks belongs. - * - * We use task_subsys_state_check() and extend the RCU verification with - * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each - * task it moves into the cgroup. Therefore by holding either of those locks, - * we pin the task to the current cgroup. - */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - struct cgroup_subsys_state *css; - - css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&task_rq(p)->lock)); - tg = container_of(css, struct task_group, css); - - return autogroup_task_group(p, tg); -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif -} - -#else /* CONFIG_CGROUP_SCHED */ - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - -#endif /* CONFIG_CGROUP_SCHED */ +DEFINE_MUTEX(sched_domains_mutex); +DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static void update_rq_clock_task(struct rq *rq, s64 delta); -static void update_rq_clock(struct rq *rq) +void update_rq_clock(struct rq *rq) { s64 delta; @@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq) } /* - * Tunables that become constants when CONFIG_SCHED_DEBUG is off: - */ -#ifdef CONFIG_SCHED_DEBUG -# define const_debug __read_mostly -#else -# define const_debug static const -#endif - -/** - * runqueue_is_locked - Returns true if the current cpu runqueue is locked - * @cpu: the processor in question. - * - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -int runqueue_is_locked(int cpu) -{ - return raw_spin_is_locked(&cpu_rq(cpu)->lock); -} - -/* * Debugging: various feature bits */ #define SCHED_FEAT(name, enabled) \ - __SCHED_FEAT_##name , - -enum { -#include "sched_features.h" -}; - -#undef SCHED_FEAT - -#define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | const_debug unsigned int sysctl_sched_features = -#include "sched_features.h" +#include "features.h" 0; #undef SCHED_FEAT @@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features = #name , static __read_mostly char *sched_feat_names[] = { -#include "sched_features.h" +#include "features.h" NULL }; @@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v) { int i; - for (i = 0; sched_feat_names[i]; i++) { + for (i = 0; i < __SCHED_FEAT_NR; i++) { if (!(sysctl_sched_features & (1UL << i))) seq_puts(m, "NO_"); seq_printf(m, "%s ", sched_feat_names[i]); @@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v) return 0; } +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true jump_label_key_enabled +#define jump_label_key__false jump_label_key_disabled + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + if (jump_label_enabled(&sched_feat_keys[i])) + jump_label_dec(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + if (!jump_label_enabled(&sched_feat_keys[i])) + jump_label_inc(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + static ssize_t sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf, cmp += 3; } - for (i = 0; sched_feat_names[i]; i++) { + for (i = 0; i < __SCHED_FEAT_NR; i++) { if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) + if (neg) { sysctl_sched_features &= ~(1UL << i); - else + sched_feat_disable(i); + } else { sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } break; } } - if (!sched_feat_names[i]) + if (i == __SCHED_FEAT_NR) return -EINVAL; *ppos += cnt; @@ -932,10 +254,7 @@ static __init int sched_init_debug(void) return 0; } late_initcall(sched_init_debug); - -#endif - -#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. @@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; */ unsigned int sysctl_sched_rt_period = 1000000; -static __read_mostly int scheduler_running; +__read_mostly int scheduler_running; /* * part of the period that we allow rt tasks to run in us. @@ -965,112 +284,7 @@ static __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -static inline u64 global_rt_period(void) -{ - return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; -} -static inline u64 global_rt_runtime(void) -{ - if (sysctl_sched_rt_runtime < 0) - return RUNTIME_INF; - - return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; -} - -#ifndef prepare_arch_switch -# define prepare_arch_switch(next) do { } while (0) -#endif -#ifndef finish_arch_switch -# define finish_arch_switch(prev) do { } while (0) -#endif - -static inline int task_current(struct rq *rq, struct task_struct *p) -{ - return rq->curr == p; -} - -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->on_cpu; -#else - return task_current(rq, p); -#endif -} - -#ifndef __ARCH_WANT_UNLOCKED_CTXSW -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - -#else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW - raw_spin_unlock_irq(&rq->lock); -#else - raw_spin_unlock(&rq->lock); -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - */ - smp_wmb(); - prev->on_cpu = 0; -#endif -#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW - local_irq_enable(); -#endif -} -#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * __task_rq_lock - lock the rq @p resides on. @@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void) * rq->lock. */ -/* - * Use hrtick when: - * - enabled by features - * - hrtimer is actually high res - */ -static inline int hrtick_enabled(struct rq *rq) -{ - if (!sched_feat(HRTICK)) - return 0; - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); -} - static void hrtick_clear(struct rq *rq) { if (hrtimer_active(&rq->hrtick_timer)) @@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg) * * called with rq->lock held and irqs disabled */ -static void hrtick_start(struct rq *rq, u64 delay) +void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; ktime_t time = ktime_add_ns(timer->base->get_time(), delay); @@ -1254,7 +454,7 @@ static __init void init_hrtick(void) * * called with rq->lock held and irqs disabled */ -static void hrtick_start(struct rq *rq, u64 delay) +void hrtick_start(struct rq *rq, u64 delay) { __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, HRTIMER_MODE_REL_PINNED, 0); @@ -1305,7 +505,7 @@ static inline void init_hrtick(void) #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +void resched_task(struct task_struct *p) { int cpu; @@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p) smp_send_reschedule(cpu); } -static void resched_cpu(int cpu) +void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu) static inline bool got_nohz_idle_kick(void) { - return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; + int cpu = smp_processor_id(); + return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } #else /* CONFIG_NO_HZ */ @@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ */ -static u64 sched_avg_period(void) -{ - return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; -} - -static void sched_avg_update(struct rq *rq) +void sched_avg_update(struct rq *rq) { s64 period = sched_avg_period(); @@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq) } } -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ - rq->rt_avg += rt_delta; - sched_avg_update(rq); -} - #else /* !CONFIG_SMP */ -static void resched_task(struct task_struct *p) +void resched_task(struct task_struct *p) { assert_raw_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } - -static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) -{ -} - -static void sched_avg_update(struct rq *rq) -{ -} #endif /* CONFIG_SMP */ -#if BITS_PER_LONG == 32 -# define WMULT_CONST (~0UL) -#else -# define WMULT_CONST (1UL << 32) -#endif - -#define WMULT_SHIFT 32 - -/* - * Shift right and round: - */ -#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) - -/* - * delta *= weight / lw - */ -static unsigned long -calc_delta_mine(unsigned long delta_exec, unsigned long weight, - struct load_weight *lw) -{ - u64 tmp; - - /* - * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched - * entities since MIN_SHARES = 2. Treat weight as 1 if less than - * 2^SCHED_LOAD_RESOLUTION. - */ - if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) - tmp = (u64)delta_exec * scale_load_down(weight); - else - tmp = (u64)delta_exec; - - if (!lw->inv_weight) { - unsigned long w = scale_load_down(lw->weight); - - if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) - lw->inv_weight = 1; - else if (unlikely(!w)) - lw->inv_weight = WMULT_CONST; - else - lw->inv_weight = WMULT_CONST / w; - } - - /* - * Check whether we'd overflow the 64-bit multiplication: - */ - if (unlikely(tmp > WMULT_CONST)) - tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, - WMULT_SHIFT/2); - else - tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); - - return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); -} - -static inline void update_load_add(struct load_weight *lw, unsigned long inc) -{ - lw->weight += inc; - lw->inv_weight = 0; -} - -static inline void update_load_sub(struct load_weight *lw, unsigned long dec) -{ - lw->weight -= dec; - lw->inv_weight = 0; -} - -static inline void update_load_set(struct load_weight *lw, unsigned long w) -{ - lw->weight = w; - lw->inv_weight = 0; -} - -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -#define WEIGHT_IDLEPRIO 3 -#define WMULT_IDLEPRIO 1431655765 - -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; - -/* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. - * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: - */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; - -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - -#ifdef CONFIG_CGROUP_CPUACCT -static void cpuacct_charge(struct task_struct *tsk, u64 cputime); -static void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -static inline void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val) {} -#endif - -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) -typedef int (*tg_visitor)(struct task_group *, void *); - /* * Iterate task_group tree rooted at *from, calling @down when first entering a * node and @up when leaving it for the final time. * * Caller must hold rcu_lock or sufficient equivalent. */ -static int walk_tg_tree_from(struct task_group *from, +int walk_tg_tree_from(struct task_group *from, tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; @@ -1657,270 +683,13 @@ out: return ret; } -/* - * Iterate the full tree, calling @down when first entering a node and @up when - * leaving it for the final time. - * - * Caller must hold rcu_lock or sufficient equivalent. - */ - -static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) -{ - return walk_tg_tree_from(&root_task_group, down, up, data); -} - -static int tg_nop(struct task_group *tg, void *data) +int tg_nop(struct task_group *tg, void *data) { return 0; } #endif -#ifdef CONFIG_SMP -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ - return cpu_rq(cpu)->load.weight; -} - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. - */ -static unsigned long source_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long total = weighted_cpuload(cpu); - - if (type == 0 || !sched_feat(LB_BIAS)) - return total; - - return max(rq->cpu_load[type-1], total); -} - -static unsigned long power_of(int cpu) -{ - return cpu_rq(cpu)->cpu_power; -} - -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = ACCESS_ONCE(rq->nr_running); - - if (nr_running) - return rq->load.weight / nr_running; - - return 0; -} - -#ifdef CONFIG_PREEMPT - -static void double_rq_lock(struct rq *rq1, struct rq *rq2); - -/* - * fair double_lock_balance: Safely acquires both rq->locks in a fair - * way at the expense of forcing extra atomic operations in all - * invocations. This assures that the double_lock is acquired using the - * same underlying policy as the spinlock_t on this architecture, which - * reduces latency compared to the unfair variant below. However, it - * also adds more overhead and therefore may reduce throughput. - */ -static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - raw_spin_unlock(&this_rq->lock); - double_rq_lock(this_rq, busiest); - - return 1; -} - -#else -/* - * Unfair double_lock_balance: Optimizes throughput at the expense of - * latency by eliminating extra atomic operations when the locks are - * already in proper order on entry. This favors lower cpu-ids and will - * grant the double lock to lower cpus over higher ids under contention, - * regardless of entry order into the function. - */ -static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) - __releases(this_rq->lock) - __acquires(busiest->lock) - __acquires(this_rq->lock) -{ - int ret = 0; - - if (unlikely(!raw_spin_trylock(&busiest->lock))) { - if (busiest < this_rq) { - raw_spin_unlock(&this_rq->lock); - raw_spin_lock(&busiest->lock); - raw_spin_lock_nested(&this_rq->lock, - SINGLE_DEPTH_NESTING); - ret = 1; - } else - raw_spin_lock_nested(&busiest->lock, - SINGLE_DEPTH_NESTING); - } - return ret; -} - -#endif /* CONFIG_PREEMPT */ - -/* - * double_lock_balance - lock the busiest runqueue, this_rq is locked already. - */ -static int double_lock_balance(struct rq *this_rq, struct rq *busiest) -{ - if (unlikely(!irqs_disabled())) { - /* printk() doesn't work good under rq->lock */ - raw_spin_unlock(&this_rq->lock); - BUG_ON(1); - } - - return _double_lock_balance(this_rq, busiest); -} - -static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) - __releases(busiest->lock) -{ - raw_spin_unlock(&busiest->lock); - lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); -} - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - if (rq1 == rq2) { - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ - } else { - if (rq1 < rq2) { - raw_spin_lock(&rq1->lock); - raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); - } else { - raw_spin_lock(&rq2->lock); - raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); - } - } -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - raw_spin_unlock(&rq1->lock); - if (rq1 != rq2) - raw_spin_unlock(&rq2->lock); - else - __release(rq2->lock); -} - -#else /* CONFIG_SMP */ - -/* - * double_rq_lock - safely lock two runqueues - * - * Note this does not disable interrupts like task_rq_lock, - * you need to do so manually before calling. - */ -static void double_rq_lock(struct rq *rq1, struct rq *rq2) - __acquires(rq1->lock) - __acquires(rq2->lock) -{ - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); - raw_spin_lock(&rq1->lock); - __acquire(rq2->lock); /* Fake it out ;) */ -} - -/* - * double_rq_unlock - safely unlock two runqueues - * - * Note this does not restore interrupts like task_rq_unlock, - * you need to do so manually after calling. - */ -static void double_rq_unlock(struct rq *rq1, struct rq *rq2) - __releases(rq1->lock) - __releases(rq2->lock) -{ - BUG_ON(rq1 != rq2); - raw_spin_unlock(&rq1->lock); - __release(rq2->lock); -} - -#endif - -static void calc_load_account_idle(struct rq *this_rq); -static void update_sysctl(void); -static int get_update_sysctl_factor(void); -static void update_cpu_load(struct rq *this_rq); - -static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -{ - set_task_rq(p, cpu); -#ifdef CONFIG_SMP - /* - * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfully executed on another CPU. We must ensure that updates of - * per-task data have been completed by this moment. - */ - smp_wmb(); - task_thread_info(p)->cpu = cpu; -#endif -} - -static const struct sched_class rt_sched_class; - -#define sched_class_highest (&stop_sched_class) -#define for_each_class(class) \ - for (class = sched_class_highest; class; class = class->next) - -#include "sched_stats.h" - -static void inc_nr_running(struct rq *rq) -{ - rq->nr_running++; -} - -static void dec_nr_running(struct rq *rq) -{ - rq->nr_running--; -} +void update_cpu_load(struct rq *this_rq); static void set_load_weight(struct task_struct *p) { @@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) /* * activate_task - move a task to the runqueue. */ -static void activate_task(struct rq *rq, struct task_struct *p, int flags) +void activate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible--; @@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags) /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +void deactivate_task(struct rq *rq, struct task_struct *p, int flags) { if (task_contributes_to_load(p)) rq->nr_uninterruptible++; @@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) #ifdef CONFIG_IRQ_TIME_ACCOUNTING static int irqtime_account_hi_update(void) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; u64 latest_ns; int ret = 0; local_irq_save(flags); latest_ns = this_cpu_read(cpu_hardirq_time); - if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) ret = 1; local_irq_restore(flags); return ret; @@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void) static int irqtime_account_si_update(void) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; u64 latest_ns; int ret = 0; local_irq_save(flags); latest_ns = this_cpu_read(cpu_softirq_time); - if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) ret = 1; local_irq_restore(flags); return ret; @@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void) #endif -#include "sched_idletask.c" -#include "sched_fair.c" -#include "sched_rt.c" -#include "sched_autogroup.c" -#include "sched_stoptask.c" -#ifdef CONFIG_SCHED_DEBUG -# include "sched_debug.c" -#endif - void sched_set_stop_task(int cpu, struct task_struct *stop) { struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; @@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) { const struct sched_class *class; @@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP -/* - * Is this task likely cache-hot: - */ -static int -task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) -{ - s64 delta; - - if (p->sched_class != &fair_sched_class) - return 0; - - if (unlikely(p->policy == SCHED_IDLE)) - return 0; - - /* - * Buddy candidates are cache hot: - */ - if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && - (&p->se == cfs_rq_of(&p->se)->next || - &p->se == cfs_rq_of(&p->se)->last)) - return 1; - - if (sysctl_sched_migration_cost == -1) - return 1; - if (sysctl_sched_migration_cost == 0) - return 0; - - delta = now - p->se.exec_start; - - return delta < (s64)sysctl_sched_migration_cost; -} - void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { #ifdef CONFIG_SCHED_DEBUG @@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ + +static inline int ttwu_share_cache(int this_cpu, int that_cpu) +{ + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); +} #endif /* CONFIG_SMP */ static void ttwu_queue(struct task_struct *p, int cpu) @@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + trace_sched_stat_sleeptime(current, rq->clock); fire_sched_in_preempt_notifiers(current); if (mm) @@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) */ static atomic_long_t calc_load_tasks_idle; -static void calc_load_account_idle(struct rq *this_rq) +void calc_load_account_idle(struct rq *this_rq) { long delta; @@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks) */ } #else -static void calc_load_account_idle(struct rq *this_rq) +void calc_load_account_idle(struct rq *this_rq) { } @@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) * scheduler tick (TICK_NSEC). With tickless idle this will not be called * every tick. We fix it up based on jiffies. */ -static void update_cpu_load(struct rq *this_rq) +void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; unsigned long curr_jiffies = jiffies; @@ -3804,8 +2538,10 @@ unlock: #endif DEFINE_PER_CPU(struct kernel_stat, kstat); +DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); +EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* * Return any ns on the sched_clock that have not yet been accounted in @@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } +#ifdef CONFIG_CGROUP_CPUACCT +struct cgroup_subsys cpuacct_subsys; +struct cpuacct root_cpuacct; +#endif + +static inline void task_group_account_field(struct task_struct *p, int index, + u64 tmp) +{ +#ifdef CONFIG_CGROUP_CPUACCT + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; +#endif + /* + * Since all updates are sure to touch the root cgroup, we + * get ourselves ahead and touch it first. If the root cgroup + * is the only cgroup, then nothing else should be necessary. + * + */ + __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + +#ifdef CONFIG_CGROUP_CPUACCT + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += tmp; + ca = parent_ca(ca); + } + rcu_read_unlock(); +#endif +} + + /* * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to @@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p) void account_user_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp; + int index; /* Add user time to process. */ - p->utime = cputime_add(p->utime, cputime); - p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + p->utime += cputime; + p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); + index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; + /* Add user time to cpustat. */ - tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) - cpustat->nice = cputime64_add(cpustat->nice, tmp); - else - cpustat->user = cputime64_add(cpustat->user, tmp); + task_group_account_field(p, index, (__force u64) cputime); - cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime, static void account_guest_time(struct task_struct *p, cputime_t cputime, cputime_t cputime_scaled) { - cputime64_t tmp; - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - - tmp = cputime_to_cputime64(cputime); + u64 *cpustat = kcpustat_this_cpu->cpustat; /* Add guest time to process. */ - p->utime = cputime_add(p->utime, cputime); - p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + p->utime += cputime; + p->utimescaled += cputime_scaled; account_group_user_time(p, cputime); - p->gtime = cputime_add(p->gtime, cputime); + p->gtime += cputime; /* Add guest time to cpustat. */ if (TASK_NICE(p) > 0) { - cpustat->nice = cputime64_add(cpustat->nice, tmp); - cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + cpustat[CPUTIME_NICE] += (__force u64) cputime; + cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; } else { - cpustat->user = cputime64_add(cpustat->user, tmp); - cpustat->guest = cputime64_add(cpustat->guest, tmp); + cpustat[CPUTIME_USER] += (__force u64) cputime; + cpustat[CPUTIME_GUEST] += (__force u64) cputime; } } @@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, */ static inline void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, cputime64_t *target_cputime64) + cputime_t cputime_scaled, int index) { - cputime64_t tmp = cputime_to_cputime64(cputime); - /* Add system time to process. */ - p->stime = cputime_add(p->stime, cputime); - p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + p->stime += cputime; + p->stimescaled += cputime_scaled; account_group_system_time(p, cputime); /* Add system time to cpustat. */ - *target_cputime64 = cputime64_add(*target_cputime64, tmp); - cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + task_group_account_field(p, index, (__force u64) cputime); /* Account for system time used */ acct_update_integrals(p); @@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t *target_cputime64; + int index; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { account_guest_time(p, cputime, cputime_scaled); @@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset, } if (hardirq_count() - hardirq_offset) - target_cputime64 = &cpustat->irq; + index = CPUTIME_IRQ; else if (in_serving_softirq()) - target_cputime64 = &cpustat->softirq; + index = CPUTIME_SOFTIRQ; else - target_cputime64 = &cpustat->system; + index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, cputime_scaled, target_cputime64); + __account_system_time(p, cputime, cputime_scaled, index); } /* @@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, */ void account_steal_time(cputime_t cputime) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); + u64 *cpustat = kcpustat_this_cpu->cpustat; - cpustat->steal = cputime64_add(cpustat->steal, cputime64); + cpustat[CPUTIME_STEAL] += (__force u64) cputime; } /* @@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime) */ void account_idle_time(cputime_t cputime) { - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t cputime64 = cputime_to_cputime64(cputime); + u64 *cpustat = kcpustat_this_cpu->cpustat; struct rq *rq = this_rq(); if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; else - cpustat->idle = cputime64_add(cpustat->idle, cputime64); + cpustat[CPUTIME_IDLE] += (__force u64) cputime; } static __always_inline bool steal_account_process_tick(void) @@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq) { cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); - struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; if (irqtime_account_hi_update()) { - cpustat->irq = cputime64_add(cpustat->irq, tmp); + cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; } else if (irqtime_account_si_update()) { - cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. @@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * Also, p->stime needs to be updated for ksoftirqd. */ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - &cpustat->softirq); + CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); } else if (p == rq->idle) { @@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); } else { __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - &cpustat->system); + CPUTIME_SYSTEM); } } @@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) { - cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); + cputime_t rtime, utime = p->utime, total = utime + p->stime; /* * Use CFS's precise accounting: @@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) rtime = nsecs_to_cputime(p->se.sum_exec_runtime); if (total) { - u64 temp = rtime; + u64 temp = (__force u64) rtime; - temp *= utime; - do_div(temp, total); - utime = (cputime_t)temp; + temp *= (__force u64) utime; + do_div(temp, (__force u32) total); + utime = (__force cputime_t) temp; } else utime = rtime; @@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) * Compare with previous values, to keep monotonicity: */ p->prev_utime = max(p->prev_utime, utime); - p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); + p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); *ut = p->prev_utime; *st = p->prev_stime; @@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) thread_group_cputime(p, &cputime); - total = cputime_add(cputime.utime, cputime.stime); + total = cputime.utime + cputime.stime; rtime = nsecs_to_cputime(cputime.sum_exec_runtime); if (total) { - u64 temp = rtime; + u64 temp = (__force u64) rtime; - temp *= cputime.utime; - do_div(temp, total); - utime = (cputime_t)temp; + temp *= (__force u64) cputime.utime; + do_div(temp, (__force u32) total); + utime = (__force cputime_t) temp; } else utime = rtime; sig->prev_utime = max(sig->prev_utime, utime); - sig->prev_stime = max(sig->prev_stime, - cputime_sub(rtime, sig->prev_utime)); + sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); *ut = sig->prev_utime; *st = sig->prev_stime; @@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev) { struct pt_regs *regs = get_irq_regs(); + if (oops_in_progress) + return; + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", prev->comm, prev->pid, preempt_count()); @@ -5852,6 +4612,13 @@ again: */ if (preempt && rq != p_rq) resched_task(p_rq->curr); + } else { + /* + * We might have set it in task_yield_fair(), but are + * not going to schedule(), so don't want to skip + * the next update. + */ + rq->skip_clock_update = 0; } out: @@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p) free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent), + task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)), (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); @@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) #endif } -/* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. - * - * This idea comes from the SD scheduler of Con Kolivas: - */ -static int get_update_sysctl_factor(void) -{ - unsigned int cpus = min_t(int, num_online_cpus(), 8); - unsigned int factor; - - switch (sysctl_sched_tunable_scaling) { - case SCHED_TUNABLESCALING_NONE: - factor = 1; - break; - case SCHED_TUNABLESCALING_LINEAR: - factor = cpus; - break; - case SCHED_TUNABLESCALING_LOG: - default: - factor = 1 + ilog2(cpus); - break; - } - - return factor; -} - -static void update_sysctl(void) -{ - unsigned int factor = get_update_sysctl_factor(); - -#define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_min_granularity); - SET_SYSCTL(sched_latency); - SET_SYSCTL(sched_wakeup_granularity); -#undef SET_SYSCTL -} - -static inline void sched_init_granularity(void) -{ - update_sysctl(); -} - #ifdef CONFIG_SMP void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { @@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq) rq->calc_load_active = 0; } -#ifdef CONFIG_CFS_BANDWIDTH -static void unthrottle_offline_cfs_rqs(struct rq *rq) -{ - struct cfs_rq *cfs_rq; - - for_each_leaf_cfs_rq(rq, cfs_rq) { - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); - - if (!cfs_rq->runtime_enabled) - continue; - - /* - * clock_task is not advancing so we just need to make sure - * there's some valid quota amount - */ - cfs_rq->runtime_remaining = cfs_b->quota; - if (cfs_rq_throttled(cfs_rq)) - unthrottle_cfs_rq(cfs_rq); - } -} -#else -static void unthrottle_offline_cfs_rqs(struct rq *rq) {} -#endif - /* * Migrate all tasks from the rq, sleeping tasks will be migrated by * try_to_wake_up()->select_task_rq(). @@ -6980,6 +5676,12 @@ out: return -ENOMEM; } +/* + * By default the system creates a single root-domain with all cpus as + * members (mimicking the global state we have today). + */ +struct root_domain def_root_domain; + static void init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) } /* + * Keep a special pointer to the highest sched_domain that has + * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this + * allows us to avoid some pointer chasing select_idle_sibling(). + * + * Also keep a unique ID per domain (we use the first cpu number in + * the cpumask of the domain), this allows us to quickly tell if + * two cpus are in the same cache domain, see ttwu_share_cache(). + */ +DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_id); + +static void update_top_cache_domain(int cpu) +{ + struct sched_domain *sd; + int id = cpu; + + sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_id, cpu) = id; +} + +/* * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ @@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp = rq->sd; rcu_assign_pointer(rq->sd, sd); destroy_sched_domains(tmp, cpu); + + update_top_cache_domain(cpu); } /* cpus with isolated domains */ @@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) continue; sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, cpu_to_node(i)); + GFP_KERNEL, cpu_to_node(cpu)); if (!sg) goto fail; @@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) return; update_group_power(sd, cpu); + atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; } /* @@ -8023,29 +6758,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, } } -static int update_runtime(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (int)(long)hcpu; - - switch (action) { - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - disable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - enable_runtime(cpu_rq(cpu)); - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } -} - void __init sched_init_smp(void) { cpumask_var_t non_isolated_cpus; @@ -8094,104 +6806,11 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq) -{ - cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); -#ifndef CONFIG_64BIT - cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; -#endif -} - -static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; - rt_rq->rt_nr_migratory = 0; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, - struct sched_entity *se, int cpu, - struct sched_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - cfs_rq->tg = tg; - cfs_rq->rq = rq; -#ifdef CONFIG_SMP - /* allow initial update_cfs_load() to truncate */ - cfs_rq->load_stamp = 1; -#endif - init_cfs_rq_runtime(cfs_rq); - - tg->cfs_rq[cpu] = cfs_rq; - tg->se[cpu] = se; - - /* se could be NULL for root_task_group */ - if (!se) - return; - - if (!parent) - se->cfs_rq = &rq->cfs; - else - se->cfs_rq = parent->my_q; - - se->my_q = cfs_rq; - update_load_set(&se->load, 0); - se->parent = parent; -} +#ifdef CONFIG_CGROUP_SCHED +struct task_group root_task_group; #endif -#ifdef CONFIG_RT_GROUP_SCHED -static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, - struct sched_rt_entity *rt_se, int cpu, - struct sched_rt_entity *parent) -{ - struct rq *rq = cpu_rq(cpu); - - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; - rt_rq->tg = tg; - - tg->rt_rq[cpu] = rt_rq; - tg->rt_se[cpu] = rt_se; - - if (!rt_se) - return; - - if (!parent) - rt_se->rt_rq = &rq->rt; - else - rt_se->rt_rq = parent->my_q; - - rt_se->my_q = rt_rq; - rt_se->parent = parent; - INIT_LIST_HEAD(&rt_se->run_list); -} -#endif +DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); void __init sched_init(void) { @@ -8249,9 +6868,17 @@ void __init sched_init(void) #ifdef CONFIG_CGROUP_SCHED list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); + INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); + #endif /* CONFIG_CGROUP_SCHED */ +#ifdef CONFIG_CGROUP_CPUACCT + root_cpuacct.cpustat = &kernel_cpustat; + root_cpuacct.cpuusage = alloc_percpu(u64); + /* Too early, not expected to fail */ + BUG_ON(!root_cpuacct.cpuusage); +#endif for_each_possible_cpu(i) { struct rq *rq; @@ -8263,7 +6890,7 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.shares = root_task_group_load; + root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); /* * How much cpu bandwidth does root_task_group get? @@ -8313,7 +6940,7 @@ void __init sched_init(void) rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ - rq->nohz_balance_kick = 0; + rq->nohz_flags = 0; #endif #endif init_rq_hrtick(rq); @@ -8326,10 +6953,6 @@ void __init sched_init(void) INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif -#ifdef CONFIG_SMP - open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#endif - #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters); #endif @@ -8357,17 +6980,11 @@ void __init sched_init(void) #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); -#ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); - atomic_set(&nohz.load_balancer, nr_cpu_ids); - atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); - atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); -#endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); -#endif /* SMP */ +#endif + init_sched_fair_class(); scheduler_running = 1; } @@ -8519,169 +7136,14 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -#ifdef CONFIG_FAIR_GROUP_SCHED -static void free_fair_sched_group(struct task_group *tg) -{ - int i; - - destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - - kfree(tg->cfs_rq); - kfree(tg->se); -} - -static -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - - tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->cfs_rq) - goto err; - tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->se) - goto err; - - tg->shares = NICE_0_LOAD; - - init_cfs_bandwidth(tg_cfs_bandwidth(tg)); - - for_each_possible_cpu(i) { - cfs_rq = kzalloc_node(sizeof(struct cfs_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!cfs_rq) - goto err; - - se = kzalloc_node(sizeof(struct sched_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!se) - goto err_free_rq; - - init_cfs_rq(cfs_rq); - init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); - } - - return 1; - -err_free_rq: - kfree(cfs_rq); -err: - return 0; -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long flags; - - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; - - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); -} -#else /* !CONFIG_FAIR_GROUP_SCHED */ -static inline void free_fair_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} - -static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) -{ -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED -static void free_rt_sched_group(struct task_group *tg) -{ - int i; - - if (tg->rt_se) - destroy_rt_bandwidth(&tg->rt_bandwidth); - - for_each_possible_cpu(i) { - if (tg->rt_rq) - kfree(tg->rt_rq[i]); - if (tg->rt_se) - kfree(tg->rt_se[i]); - } - - kfree(tg->rt_rq); - kfree(tg->rt_se); -} - -static -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - struct rt_rq *rt_rq; - struct sched_rt_entity *rt_se; - int i; - - tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_rq) - goto err; - tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); - if (!tg->rt_se) - goto err; - - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); - - for_each_possible_cpu(i) { - rt_rq = kzalloc_node(sizeof(struct rt_rq), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_rq) - goto err; - - rt_se = kzalloc_node(sizeof(struct sched_rt_entity), - GFP_KERNEL, cpu_to_node(i)); - if (!rt_se) - goto err_free_rq; - - init_rt_rq(rt_rq, cpu_rq(i)); - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; - init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); - } - - return 1; - -err_free_rq: - kfree(rt_rq); -err: - return 0; -} #else /* !CONFIG_RT_GROUP_SCHED */ -static inline void free_rt_sched_group(struct task_group *tg) -{ -} - -static inline -int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) -{ - return 1; -} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); + static void free_sched_group(struct task_group *tg) { free_fair_sched_group(tg); @@ -8787,47 +7249,6 @@ void sched_move_task(struct task_struct *tsk) #endif /* CONFIG_CGROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED -static DEFINE_MUTEX(shares_mutex); - -int sched_group_set_shares(struct task_group *tg, unsigned long shares) -{ - int i; - unsigned long flags; - - /* - * We can't change the weight of the root cgroup. - */ - if (!tg->se[0]) - return -EINVAL; - - shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); - - mutex_lock(&shares_mutex); - if (tg->shares == shares) - goto done; - - tg->shares = shares; - for_each_possible_cpu(i) { - struct rq *rq = cpu_rq(i); - struct sched_entity *se; - - se = tg->se[i]; - /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - -done: - mutex_unlock(&shares_mutex); - return 0; -} - -unsigned long sched_group_shares(struct task_group *tg) -{ - return tg->shares; -} #endif #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) @@ -8852,7 +7273,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg) struct task_struct *g, *p; do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + if (rt_task(p) && task_rq(p)->rt.tg == tg) return 1; } while_each_thread(g, p); @@ -9203,8 +7624,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime); static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) { - int i, ret = 0, runtime_enabled; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + int i, ret = 0, runtime_enabled, runtime_was_enabled; + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; if (tg == &root_task_group) return -EINVAL; @@ -9231,6 +7652,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) goto out_unlock; runtime_enabled = quota != RUNTIME_INF; + runtime_was_enabled = cfs_b->quota != RUNTIME_INF; + account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); raw_spin_lock_irq(&cfs_b->lock); cfs_b->period = ns_to_ktime(period); cfs_b->quota = quota; @@ -9246,13 +7669,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_possible_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; - struct rq *rq = rq_of(cfs_rq); + struct rq *rq = cfs_rq->rq; raw_spin_lock_irq(&rq->lock); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; - if (cfs_rq_throttled(cfs_rq)) + if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); raw_spin_unlock_irq(&rq->lock); } @@ -9266,7 +7689,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) { u64 quota, period; - period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + period = ktime_to_ns(tg->cfs_bandwidth.period); if (cfs_quota_us < 0) quota = RUNTIME_INF; else @@ -9279,10 +7702,10 @@ long tg_get_cfs_quota(struct task_group *tg) { u64 quota_us; - if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) + if (tg->cfs_bandwidth.quota == RUNTIME_INF) return -1; - quota_us = tg_cfs_bandwidth(tg)->quota; + quota_us = tg->cfs_bandwidth.quota; do_div(quota_us, NSEC_PER_USEC); return quota_us; @@ -9293,10 +7716,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) u64 quota, period; period = (u64)cfs_period_us * NSEC_PER_USEC; - quota = tg_cfs_bandwidth(tg)->quota; - - if (period <= 0) - return -EINVAL; + quota = tg->cfs_bandwidth.quota; return tg_set_cfs_bandwidth(tg, period, quota); } @@ -9305,7 +7725,7 @@ long tg_get_cfs_period(struct task_group *tg) { u64 cfs_period_us; - cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period); do_div(cfs_period_us, NSEC_PER_USEC); return cfs_period_us; @@ -9365,13 +7785,13 @@ static u64 normalize_cfs_quota(struct task_group *tg, static int tg_cfs_schedulable_down(struct task_group *tg, void *data) { struct cfs_schedulable_data *d = data; - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; s64 quota = 0, parent_quota = -1; if (!tg->parent) { quota = RUNTIME_INF; } else { - struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); + struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; quota = normalize_cfs_quota(tg, d); parent_quota = parent_b->hierarchal_quota; @@ -9415,7 +7835,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, struct cgroup_map_cb *cb) { struct task_group *tg = cgroup_tg(cgrp); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); @@ -9516,38 +7936,16 @@ struct cgroup_subsys cpu_cgroup_subsys = { * (balbir@in.ibm.com). */ -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; - struct cpuacct *parent; -}; - -struct cgroup_subsys cpuacct_subsys; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); - int i; + struct cpuacct *ca; + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); if (!ca) goto out; @@ -9555,18 +7953,13 @@ static struct cgroup_subsys_state *cpuacct_create( if (!ca->cpuusage) goto out_free_ca; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - if (percpu_counter_init(&ca->cpustat[i], 0)) - goto out_free_counters; - - if (cgrp->parent) - ca->parent = cgroup_ca(cgrp->parent); + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; return &ca->css; -out_free_counters: - while (--i >= 0) - percpu_counter_destroy(&ca->cpustat[i]); +out_free_cpuusage: free_percpu(ca->cpuusage); out_free_ca: kfree(ca); @@ -9579,10 +7972,8 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); - int i; - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) - percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpustat); free_percpu(ca->cpuusage); kfree(ca); } @@ -9675,16 +8066,31 @@ static const char *cpuacct_stat_desc[] = { }; static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) + struct cgroup_map_cb *cb) { struct cpuacct *ca = cgroup_ca(cgrp); - int i; + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { - s64 val = percpu_counter_read(&ca->cpustat[i]); - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[i], val); + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + return 0; } @@ -9714,7 +8120,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) * * called with rq->lock held. */ -static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; int cpu; @@ -9728,7 +8134,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); - for (; ca; ca = ca->parent) { + for (; ca; ca = parent_ca(ca)) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } @@ -9736,45 +8142,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } -/* - * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large - * in cputime_t units. As a result, cpuacct_update_stats calls - * percpu_counter_add with values large enough to always overflow the - * per cpu batch limit causing bad SMP scalability. - * - * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we - * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled - * and enabled. We cap it at INT_MAX which is the largest allowed batch value. - */ -#ifdef CONFIG_SMP -#define CPUACCT_BATCH \ - min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) -#else -#define CPUACCT_BATCH 0 -#endif - -/* - * Charge the system/user time to the task's accounting group. - */ -static void cpuacct_update_stats(struct task_struct *tsk, - enum cpuacct_stat_index idx, cputime_t val) -{ - struct cpuacct *ca; - int batch = CPUACCT_BATCH; - - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(tsk); - - do { - __percpu_counter_add(&ca->cpustat[idx], val, batch); - ca = ca->parent; - } while (ca); - rcu_read_unlock(); -} - struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .create = cpuacct_create, diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c index a86cf9d9eb1..b0d798eaf13 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched/cpupri.c @@ -1,5 +1,5 @@ /* - * kernel/sched_cpupri.c + * kernel/sched/cpupri.c * * CPU priority management * @@ -28,7 +28,7 @@ */ #include <linux/gfp.h> -#include "sched_cpupri.h" +#include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ static int convert_prio(int prio) diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h index f6d75617349..f6d75617349 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched/cpupri.h diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c index a6710a112b4..2a075e10004 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched/debug.c @@ -1,5 +1,5 @@ /* - * kernel/time/sched_debug.c + * kernel/sched/debug.c * * Print the CFS rbtree * @@ -16,6 +16,8 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> +#include "sched.h" + static DEFINE_SPINLOCK(sched_debug_lock); /* @@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v) return 0; } -static void sysrq_sched_debug_show(void) +void sysrq_sched_debug_show(void) { sched_debug_show(NULL, NULL); } diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c index 8a39fa3e3c6..8e42de9105f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched/fair.c @@ -23,6 +23,13 @@ #include <linux/latencytop.h> #include <linux/sched.h> #include <linux/cpumask.h> +#include <linux/slab.h> +#include <linux/profile.h> +#include <linux/interrupt.h> + +#include <trace/events/sched.h> + +#include "sched.h" /* * Targeted preemption latency for CPU-bound tasks: @@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif -static const struct sched_class fair_sched_class; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static int get_update_sysctl_factor(void) +{ + unsigned int cpus = min_t(int, num_online_cpus(), 8); + unsigned int factor; + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + factor = 1; + break; + case SCHED_TUNABLESCALING_LINEAR: + factor = cpus; + break; + case SCHED_TUNABLESCALING_LOG: + default: + factor = 1 + ilog2(cpus); + break; + } + + return factor; +} + +static void update_sysctl(void) +{ + unsigned int factor = get_update_sysctl_factor(); + +#define SET_SYSCTL(name) \ + (sysctl_##name = (factor) * normalized_sysctl_##name) + SET_SYSCTL(sched_min_granularity); + SET_SYSCTL(sched_latency); + SET_SYSCTL(sched_wakeup_granularity); +#undef SET_SYSCTL +} + +void sched_init_granularity(void) +{ + update_sysctl(); +} + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; + + if (!lw->inv_weight) { + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; + } + + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + + +const struct sched_class fair_sched_class; /************************************************************** * CFS operations on generic schedulable entities: @@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } -static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) } #ifdef CONFIG_SCHED_DEBUG -static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); @@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) - inc_cpu_load(rq_of(cfs_rq), se->load.weight); + update_load_add(&rq_of(cfs_rq)->load, se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, se->load.weight); list_add(&se->group_node, &cfs_rq->tasks); @@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) - dec_cpu_load(rq_of(cfs_rq), se->load.weight); + update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); if (entity_is_task(se)) { add_cfs_task_weight(cfs_rq, -se->load.weight); list_del_init(&se->group_node); @@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; - se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; - se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) trace_sched_stat_iowait(tsk, delta); } + trace_sched_stat_blocked(tsk, delta); + /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the @@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) */ #ifdef CONFIG_CFS_BANDWIDTH + +#ifdef HAVE_JUMP_LABEL +static struct jump_label_key __cfs_bandwidth_used; + +static inline bool cfs_bandwidth_used(void) +{ + return static_branch(&__cfs_bandwidth_used); +} + +void account_cfs_bandwidth_used(int enabled, int was_enabled) +{ + /* only need to count groups transitioning between enabled/!enabled */ + if (enabled && !was_enabled) + jump_label_inc(&__cfs_bandwidth_used); + else if (!enabled && was_enabled) + jump_label_dec(&__cfs_bandwidth_used); +} +#else /* HAVE_JUMP_LABEL */ +static bool cfs_bandwidth_used(void) +{ + return true; +} + +void account_cfs_bandwidth_used(int enabled, int was_enabled) {} +#endif /* HAVE_JUMP_LABEL */ + /* * default period for cfs group bandwidth. * default: 0.1s, units: nanoseconds @@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void) * * requires cfs_b->lock */ -static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) +void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { u64 now; @@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); } +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return &tg->cfs_bandwidth; +} + /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { - if (!cfs_rq->runtime_enabled) + if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; __account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { - return cfs_rq->throttled; + return cfs_bandwidth_used() && cfs_rq->throttled; } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { - return cfs_rq->throttle_count; + return cfs_bandwidth_used() && cfs_rq->throttle_count; } /* @@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) raw_spin_unlock(&cfs_b->lock); } -static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) +void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); @@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) return; @@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) /* conditionally throttle active cfs_rq's from put_prev_entity() */ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { + if (!cfs_bandwidth_used()) + return; + if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) return; @@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } -#else + +static inline u64 default_cfs_period(void); +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun); +static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b); + +static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); + + return HRTIMER_NORESTART; +} + +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, cfs_b->period); + + if (!overrun) + break; + + idle = do_sched_cfs_period_timer(cfs_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + raw_spin_lock_init(&cfs_b->lock); + cfs_b->runtime = 0; + cfs_b->quota = RUNTIME_INF; + cfs_b->period = ns_to_ktime(default_cfs_period()); + + INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->period_timer.function = sched_cfs_period_timer; + hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->slack_timer.function = sched_cfs_slack_timer; +} + +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq->runtime_enabled = 0; + INIT_LIST_HEAD(&cfs_rq->throttled_list); +} + +/* requires cfs_b->lock, may release to reprogram timer */ +void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + /* + * The timer may be active because we're trying to set a new bandwidth + * period or because we're racing with the tear-down path + * (timer_active==0 becomes visible before the hrtimer call-back + * terminates). In either case we ensure that it's re-programmed + */ + while (unlikely(hrtimer_active(&cfs_b->period_timer))) { + raw_spin_unlock(&cfs_b->lock); + /* ensure cfs_b->lock is available while we wait */ + hrtimer_cancel(&cfs_b->period_timer); + + raw_spin_lock(&cfs_b->lock); + /* if someone else restarted the timer then we're done */ + if (cfs_b->timer_active) + return; + } + + cfs_b->timer_active = 1; + start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); +} + +static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) +{ + hrtimer_cancel(&cfs_b->period_timer); + hrtimer_cancel(&cfs_b->slack_timer); +} + +void unthrottle_offline_cfs_rqs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + + if (!cfs_rq->runtime_enabled) + continue; + + /* + * clock_task is not advancing so we just need to make sure + * there's some valid quota amount + */ + cfs_rq->runtime_remaining = cfs_b->quota; + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + } +} + +#else /* CONFIG_CFS_BANDWIDTH */ static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg, { return 0; } + +void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif +static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) +{ + return NULL; +} +static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} +void unthrottle_offline_cfs_rqs(struct rq *rq) {} + +#endif /* CONFIG_CFS_BANDWIDTH */ + /************************************************** * CFS operations on tasks: */ @@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) WARN_ON(task_rq(p) != rq); - if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + if (cfs_rq->nr_running > 1) { u64 slice = sched_slice(cfs_rq, se); u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; s64 delta = slice - ran; @@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (curr->sched_class != &fair_sched_class) + if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) } #ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static unsigned long power_of(int cpu) +{ + return cpu_rq(cpu)->cpu_power; +} + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + + if (nr_running) + return rq->load.weight / nr_running; + + return 0; +} + static void task_waking_fair(struct task_struct *p) { @@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target) int prev_cpu = task_cpu(p); struct sched_domain *sd; struct sched_group *sg; - int i, smt = 0; + int i; /* * If the task is going to be woken-up on this cpu and if it is @@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target) * Otherwise, iterate the domains and find an elegible idle cpu. */ rcu_read_lock(); -again: - for_each_domain(target, sd) { - if (!smt && (sd->flags & SD_SHARE_CPUPOWER)) - continue; - - if (smt && !(sd->flags & SD_SHARE_CPUPOWER)) - break; - - if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) - break; + sd = rcu_dereference(per_cpu(sd_llc, target)); + for_each_lower_domain(sd) { sg = sd->groups; do { if (!cpumask_intersects(sched_group_cpus(sg), @@ -2376,10 +2692,6 @@ next: sg = sg->next; } while (sg != sd->groups); } - if (!smt) { - smt = 1; - goto again; - } done: rcu_read_unlock(); @@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; + if (p->rt.nr_cpus_allowed == 1) + return prev_cpu; + if (sd_flag & SD_BALANCE_WAKE) { if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) want_affine = 1; @@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) } while (cfs_rq); p = task_of(se); - hrtick_start_fair(rq, p); + if (hrtick_enabled(rq)) + hrtick_start_fair(rq, p); return p; } @@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq->skip_clock_update = 1; } set_skip_buddy(se); @@ -2776,12 +3098,48 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, } /* + * Is this task likely cache-hot: + */ +static int +task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) +{ + s64 delta; + + if (p->sched_class != &fair_sched_class) + return 0; + + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + + /* + * Buddy candidates are cache hot: + */ + if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && + (&p->se == cfs_rq_of(&p->se)->next || + &p->se == cfs_rq_of(&p->se)->last)) + return 1; + + if (sysctl_sched_migration_cost == -1) + return 1; + if (sysctl_sched_migration_cost == 0) + return 0; + + delta = now - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +} + +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 +#define LBF_ABORT 0x04 + +/* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { int tsk_cache_hot = 0; /* @@ -2794,7 +3152,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *all_pinned = 0; + *lb_flags &= ~LBF_ALL_PINNED; if (task_running(rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); @@ -2868,7 +3226,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, static unsigned long balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *all_pinned, + enum cpu_idle_type idle, int *lb_flags, struct cfs_rq *busiest_cfs_rq) { int loops = 0, pulled = 0; @@ -2879,12 +3237,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, goto out; list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) + if (loops++ > sysctl_sched_nr_migrate) { + *lb_flags |= LBF_NEED_BREAK; break; + } if ((p->se.load.weight >> 1) > rem_load_move || !can_migrate_task(p, busiest, this_cpu, sd, idle, - all_pinned)) + lb_flags)) continue; pull_task(busiest, p, this_rq, this_cpu); @@ -2897,8 +3257,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) + if (idle == CPU_NEWLY_IDLE) { + *lb_flags |= LBF_ABORT; break; + } #endif /* @@ -3003,7 +3365,7 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { long rem_load_move = max_load_move; struct cfs_rq *busiest_cfs_rq; @@ -3016,6 +3378,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + /* * empty group or part of a throttled hierarchy */ @@ -3027,7 +3392,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, all_pinned, + rem_load, sd, idle, lb_flags, busiest_cfs_rq); if (!moved_load) @@ -3053,10 +3418,10 @@ static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, all_pinned, + max_load_move, sd, idle, lb_flags, &busiest->cfs); } #endif @@ -3071,29 +3436,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, - int *all_pinned) + int *lb_flags) { unsigned long total_load_moved = 0, load_moved; do { load_moved = load_balance_fair(this_rq, this_cpu, busiest, max_load_move - total_load_moved, - sd, idle, all_pinned); + sd, idle, lb_flags); total_load_moved += load_moved; + if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + break; + #ifdef CONFIG_PREEMPT /* * NEWIDLE balancing is a source of latency, so preemptible * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) - break; - - if (raw_spin_is_contended(&this_rq->lock) || - raw_spin_is_contended(&busiest->lock)) + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { + *lb_flags |= LBF_ABORT; break; + } #endif } while (load_moved && max_load_move > total_load_moved); @@ -3155,15 +3521,6 @@ struct sg_lb_stats { }; /** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. - */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - -/** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. * @idle: The Idle status of the CPU for whose sd load_icx is obtained. @@ -3412,7 +3769,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) sdg->sgp->power = power; } -static void update_group_power(struct sched_domain *sd, int cpu) +void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; @@ -3678,11 +4035,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } while (sg != sd->groups); } -int __weak arch_sd_sibling_asym_packing(void) -{ - return 0*SD_ASYM_PACKING; -} - /** * check_asym_packing - Check to see if the group is packed into the * sched doman. @@ -4046,7 +4398,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); static int need_active_balance(struct sched_domain *sd, int idle, int busiest_cpu, int this_cpu) @@ -4097,7 +4449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, all_pinned = 0, active_balance = 0; + int ld_moved, lb_flags = 0, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -4138,11 +4490,11 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - all_pinned = 1; + lb_flags |= LBF_ALL_PINNED; local_irq_save(flags); double_rq_lock(this_rq, busiest); ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &all_pinned); + imbalance, sd, idle, &lb_flags); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4152,8 +4504,16 @@ redo: if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); + if (lb_flags & LBF_ABORT) + goto out_balanced; + + if (lb_flags & LBF_NEED_BREAK) { + lb_flags &= ~LBF_NEED_BREAK; + goto redo; + } + /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(all_pinned)) { + if (unlikely(lb_flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4183,7 +4543,7 @@ redo: tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - all_pinned = 1; + lb_flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4236,7 +4596,8 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + if (((lb_flags & LBF_ALL_PINNED) && + sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4249,7 +4610,7 @@ out: * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static void idle_balance(int this_cpu, struct rq *this_rq) +void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; int pulled_task = 0; @@ -4364,28 +4725,16 @@ out_unlock: #ifdef CONFIG_NO_HZ /* * idle load balancing details - * - One of the idle CPUs nominates itself as idle load_balancer, while - * entering idle. - * - This idle load balancer CPU will also go into tickless mode when - * it is idle, just like all other idle CPUs * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. */ static struct { - atomic_t load_balancer; - atomic_t first_pick_cpu; - atomic_t second_pick_cpu; cpumask_var_t idle_cpus_mask; - cpumask_var_t grp_idle_mask; + atomic_t nr_cpus; unsigned long next_balance; /* in jiffy units */ } nohz ____cacheline_aligned; -int get_nohz_load_balancer(void) -{ - return atomic_read(&nohz.load_balancer); -} - #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) /** * lowest_flag_domain - Return lowest sched_domain containing flag. @@ -4422,33 +4771,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) (sd && (sd->flags & flag)); sd = sd->parent) /** - * is_semi_idle_group - Checks if the given sched_group is semi-idle. - * @ilb_group: group to be checked for semi-idleness - * - * Returns: 1 if the group is semi-idle. 0 otherwise. - * - * We define a sched_group to be semi idle if it has atleast one idle-CPU - * and atleast one non-idle CPU. This helper function checks if the given - * sched_group is semi-idle or not. - */ -static inline int is_semi_idle_group(struct sched_group *ilb_group) -{ - cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, - sched_group_cpus(ilb_group)); - - /* - * A sched_group is semi-idle when it has atleast one busy cpu - * and atleast one idle cpu. - */ - if (cpumask_empty(nohz.grp_idle_mask)) - return 0; - - if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) - return 0; - - return 1; -} -/** * find_new_ilb - Finds the optimum idle load balancer for nomination. * @cpu: The cpu which is nominating a new idle_load_balancer. * @@ -4462,9 +4784,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group) */ static int find_new_ilb(int cpu) { + int ilb = cpumask_first(nohz.idle_cpus_mask); + struct sched_group *ilbg; struct sched_domain *sd; - struct sched_group *ilb_group; - int ilb = nr_cpu_ids; /* * Have idle load balancer selection from semi-idle packages only @@ -4482,23 +4804,28 @@ static int find_new_ilb(int cpu) rcu_read_lock(); for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { - ilb_group = sd->groups; + ilbg = sd->groups; do { - if (is_semi_idle_group(ilb_group)) { - ilb = cpumask_first(nohz.grp_idle_mask); + if (ilbg->group_weight != + atomic_read(&ilbg->sgp->nr_busy_cpus)) { + ilb = cpumask_first_and(nohz.idle_cpus_mask, + sched_group_cpus(ilbg)); goto unlock; } - ilb_group = ilb_group->next; + ilbg = ilbg->next; - } while (ilb_group != sd->groups); + } while (ilbg != sd->groups); } unlock: rcu_read_unlock(); out_done: - return ilb; + if (ilb < nr_cpu_ids && idle_cpu(ilb)) + return ilb; + + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) @@ -4518,99 +4845,68 @@ static void nohz_balancer_kick(int cpu) nohz.next_balance++; - ilb_cpu = get_nohz_load_balancer(); - - if (ilb_cpu >= nr_cpu_ids) { - ilb_cpu = cpumask_first(nohz.idle_cpus_mask); - if (ilb_cpu >= nr_cpu_ids) - return; - } + ilb_cpu = find_new_ilb(cpu); - if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { - cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + if (ilb_cpu >= nr_cpu_ids) + return; - smp_mb(); - /* - * Use smp_send_reschedule() instead of resched_cpu(). - * This way we generate a sched IPI on the target cpu which - * is idle. And the softirq performing nohz idle load balance - * will be run before returning from the IPI. - */ - smp_send_reschedule(ilb_cpu); - } + if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu))) + return; + /* + * Use smp_send_reschedule() instead of resched_cpu(). + * This way we generate a sched IPI on the target cpu which + * is idle. And the softirq performing nohz idle load balance + * will be run before returning from the IPI. + */ + smp_send_reschedule(ilb_cpu); return; } -/* - * This routine will try to nominate the ilb (idle load balancing) - * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus. - * - * When the ilb owner becomes busy, we will not have new ilb owner until some - * idle CPU wakes up and goes back to idle or some busy CPU tries to kick - * idle load balancing by kicking one of the idle CPUs. - * - * Ticks are stopped for the ilb owner as well, with busy CPU kicking this - * ilb owner CPU in future (when there is a need for idle load balancing on - * behalf of all idle CPUs). - */ -void select_nohz_load_balancer(int stop_tick) +static inline void set_cpu_sd_state_busy(void) { + struct sched_domain *sd; int cpu = smp_processor_id(); - if (stop_tick) { - if (!cpu_active(cpu)) { - if (atomic_read(&nohz.load_balancer) != cpu) - return; - - /* - * If we are going offline and still the leader, - * give up! - */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, - nr_cpu_ids) != cpu) - BUG(); + if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) + return; + clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - return; - } + rcu_read_lock(); + for_each_domain(cpu, sd) + atomic_inc(&sd->groups->sgp->nr_busy_cpus); + rcu_read_unlock(); +} - cpumask_set_cpu(cpu, nohz.idle_cpus_mask); +void set_cpu_sd_state_idle(void) +{ + struct sched_domain *sd; + int cpu = smp_processor_id(); - if (atomic_read(&nohz.first_pick_cpu) == cpu) - atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.second_pick_cpu) == cpu) - atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) + return; + set_bit(NOHZ_IDLE, nohz_flags(cpu)); - if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { - int new_ilb; + rcu_read_lock(); + for_each_domain(cpu, sd) + atomic_dec(&sd->groups->sgp->nr_busy_cpus); + rcu_read_unlock(); +} - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, - cpu) != nr_cpu_ids) - return; +/* + * This routine will record that this cpu is going idle with tick stopped. + * This info will be used in performing idle load balancing in the future. + */ +void select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); - /* - * Check to see if there is a more power-efficient - * ilb. - */ - new_ilb = find_new_ilb(cpu); - if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, nr_cpu_ids); - resched_cpu(new_ilb); - return; - } - return; - } - } else { - if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + if (stop_tick) { + if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) return; - cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); - - if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, - nr_cpu_ids) != cpu) - BUG(); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + atomic_inc(&nohz.nr_cpus); + set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); } return; } @@ -4624,7 +4920,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ -static void update_max_interval(void) +void update_max_interval(void) { max_load_balance_interval = HZ*num_online_cpus()/10; } @@ -4716,11 +5012,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) struct rq *rq; int balance_cpu; - if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) - return; + if (idle != CPU_IDLE || + !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu))) + goto end; for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { - if (balance_cpu == this_cpu) + if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) continue; /* @@ -4728,10 +5025,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) * work being done for other cpus. Next load * balancing owner will pick it up. */ - if (need_resched()) { - this_rq->nohz_balance_kick = 0; + if (need_resched()) break; - } raw_spin_lock_irq(&this_rq->lock); update_rq_clock(this_rq); @@ -4745,53 +5040,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) this_rq->next_balance = rq->next_balance; } nohz.next_balance = this_rq->next_balance; - this_rq->nohz_balance_kick = 0; +end: + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)); } /* - * Current heuristic for kicking the idle load balancer - * - first_pick_cpu is the one of the busy CPUs. It will kick - * idle load balancer when it has more than one process active. This - * eliminates the need for idle load balancing altogether when we have - * only one running process in the system (common case). - * - If there are more than one busy CPU, idle load balancer may have - * to run for active_load_balance to happen (i.e., two busy CPUs are - * SMT or core siblings and can run better if they move to different - * physical CPUs). So, second_pick_cpu is the second of the busy CPUs - * which will kick idle load balancer as soon as it has any load. + * Current heuristic for kicking the idle load balancer in the presence + * of an idle cpu is the system. + * - This rq has more than one task. + * - At any scheduler domain level, this cpu's scheduler group has multiple + * busy cpu's exceeding the group's power. + * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler + * domain span are idle. */ static inline int nohz_kick_needed(struct rq *rq, int cpu) { unsigned long now = jiffies; - int ret; - int first_pick_cpu, second_pick_cpu; + struct sched_domain *sd; - if (time_before(now, nohz.next_balance)) + if (unlikely(idle_cpu(cpu))) return 0; - if (idle_cpu(cpu)) - return 0; + /* + * We may be recently in ticked or tickless idle mode. At the first + * busy tick after returning from idle, we will update the busy stats. + */ + set_cpu_sd_state_busy(); + if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { + clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + atomic_dec(&nohz.nr_cpus); + } - first_pick_cpu = atomic_read(&nohz.first_pick_cpu); - second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + /* + * None are in tickless mode and hence no need for NOHZ idle load + * balancing. + */ + if (likely(!atomic_read(&nohz.nr_cpus))) + return 0; - if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && - second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + if (time_before(now, nohz.next_balance)) return 0; - ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); - if (ret == nr_cpu_ids || ret == cpu) { - atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (rq->nr_running > 1) - return 1; - } else { - ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); - if (ret == nr_cpu_ids || ret == cpu) { - if (rq->nr_running) - return 1; - } + if (rq->nr_running >= 2) + goto need_kick; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + struct sched_group *sg = sd->groups; + struct sched_group_power *sgp = sg->sgp; + int nr_busy = atomic_read(&sgp->nr_busy_cpus); + + if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) + goto need_kick_unlock; + + if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight + && (cpumask_first_and(nohz.idle_cpus_mask, + sched_domain_span(sd)) < cpu)) + goto need_kick_unlock; + + if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING))) + break; } + rcu_read_unlock(); return 0; + +need_kick_unlock: + rcu_read_unlock(); +need_kick: + return 1; } #else static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } @@ -4826,14 +5143,14 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ -static inline void trigger_load_balance(struct rq *rq, int cpu) +void trigger_load_balance(struct rq *rq, int cpu) { /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); #ifdef CONFIG_NO_HZ - else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif } @@ -4848,15 +5165,6 @@ static void rq_offline_fair(struct rq *rq) update_sysctl(); } -#else /* CONFIG_SMP */ - -/* - * on UP we do not need to balance between CPUs: - */ -static inline void idle_balance(int cpu, struct rq *rq) -{ -} - #endif /* CONFIG_SMP */ /* @@ -4880,8 +5188,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct cfs_rq *cfs_rq = task_cfs_rq(current); - struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se, *curr; int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); unsigned long flags; @@ -4890,6 +5198,9 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); + cfs_rq = task_cfs_rq(current); + curr = cfs_rq->curr; + if (unlikely(task_cpu(p) != this_cpu)) { rcu_read_lock(); __set_task_cpu(p, this_cpu); @@ -4999,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq) } } +void init_cfs_rq(struct cfs_rq *cfs_rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + #ifdef CONFIG_FAIR_GROUP_SCHED static void task_move_group_fair(struct task_struct *p, int on_rq) { @@ -5015,13 +5336,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * to another cgroup's rq. This does somewhat interfere with the * fair sleeper stuff for the first placement, but who cares. */ + /* + * When !on_rq, vruntime of the task has usually NOT been normalized. + * But there are some cases where it has already been normalized: + * + * - Moving a forked child which is waiting for being woken up by + * wake_up_new_task(). + * - Moving a task which has been woken up by try_to_wake_up() and + * waiting for actually being woken up by sched_ttwu_pending(). + * + * To prevent boost or penalty in the new cfs_rq caused by delta + * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. + */ + if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) + on_rq = 1; + if (!on_rq) p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; set_task_rq(p, task_cpu(p)); if (!on_rq) p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; } + +void free_fair_sched_group(struct task_group *tg) +{ + int i; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) + goto err; + + tg->shares = NICE_0_LOAD; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err_free_rq; + + init_cfs_rq(cfs_rq); + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + } + + return 1; + +err_free_rq: + kfree(cfs_rq); +err: + return 0; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; #endif + init_cfs_rq_runtime(cfs_rq); + + tg->cfs_rq[cpu] = cfs_rq; + tg->se[cpu] = se; + + /* se could be NULL for root_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = parent->my_q; + + se->my_q = cfs_rq; + update_load_set(&se->load, 0); + se->parent = parent; +} + +static DEFINE_MUTEX(shares_mutex); + +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + tg->shares = shares; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +done: + mutex_unlock(&shares_mutex); + return 0; +} +#else /* CONFIG_FAIR_GROUP_SCHED */ + +void free_fair_sched_group(struct task_group *tg) { } + +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +void unregister_fair_sched_group(struct task_group *tg, int cpu) { } + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { @@ -5041,7 +5531,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -static const struct sched_class fair_sched_class = { +const struct sched_class fair_sched_class = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, @@ -5078,7 +5568,7 @@ static const struct sched_class fair_sched_class = { }; #ifdef CONFIG_SCHED_DEBUG -static void print_cfs_stats(struct seq_file *m, int cpu) +void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; @@ -5088,3 +5578,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu) rcu_read_unlock(); } #endif + +__init void init_sched_fair_class(void) +{ +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + +#ifdef CONFIG_NO_HZ + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); +#endif +#endif /* SMP */ + +} diff --git a/kernel/sched_features.h b/kernel/sched/features.h index 84802245abd..e61fd73913d 100644 --- a/kernel/sched_features.h +++ b/kernel/sched/features.h @@ -3,13 +3,13 @@ * them to run sooner, but does not allow tons of sleepers to * rip the spread apart. */ -SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) /* * Place new tasks ahead so that they do not starve already running * tasks */ -SCHED_FEAT(START_DEBIT, 1) +SCHED_FEAT(START_DEBIT, true) /* * Based on load and program behaviour, see if it makes sense to place @@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1) * improve cache locality. Typically used with SYNC wakeups as * generated by pipes and the like, see also SYNC_WAKEUPS. */ -SCHED_FEAT(AFFINE_WAKEUPS, 1) +SCHED_FEAT(AFFINE_WAKEUPS, true) /* * Prefer to schedule the task we woke last (assuming it failed * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, 0) +SCHED_FEAT(NEXT_BUDDY, false) /* * Prefer to schedule the task that ran last (when we did * wake-preempt) as that likely will touch the same data, increases * cache locality. */ -SCHED_FEAT(LAST_BUDDY, 1) +SCHED_FEAT(LAST_BUDDY, true) /* * Consider buddies to be cache hot, decreases the likelyness of a * cache buddy being migrated away, increases cache locality. */ -SCHED_FEAT(CACHE_HOT_BUDDY, 1) +SCHED_FEAT(CACHE_HOT_BUDDY, true) /* * Use arch dependent cpu power functions */ -SCHED_FEAT(ARCH_POWER, 0) +SCHED_FEAT(ARCH_POWER, false) -SCHED_FEAT(HRTICK, 0) -SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(LB_BIAS, 1) +SCHED_FEAT(HRTICK, false) +SCHED_FEAT(DOUBLE_TICK, false) +SCHED_FEAT(LB_BIAS, true) /* * Spin-wait on mutex acquisition when the mutex owner is running on * another cpu -- assumes that when the owner is running, it will soon * release the lock. Decreases scheduling overhead. */ -SCHED_FEAT(OWNER_SPIN, 1) +SCHED_FEAT(OWNER_SPIN, true) /* * Decrement CPU power based on time not spent running tasks */ -SCHED_FEAT(NONTASK_POWER, 1) +SCHED_FEAT(NONTASK_POWER, true) /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ -SCHED_FEAT(TTWU_QUEUE, 1) +SCHED_FEAT(TTWU_QUEUE, true) -SCHED_FEAT(FORCE_SD_OVERLAP, 0) -SCHED_FEAT(RT_RUNTIME_SHARE, 1) +SCHED_FEAT(FORCE_SD_OVERLAP, false) +SCHED_FEAT(RT_RUNTIME_SHARE, true) diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c index 0a51882534e..91b4c957f28 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched/idle_task.c @@ -1,3 +1,5 @@ +#include "sched.h" + /* * idle-task scheduling class. * @@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task /* * Simple, special scheduling class for the per-CPU idle tasks: */ -static const struct sched_class idle_sched_class = { +const struct sched_class idle_sched_class = { /* .next is NULL */ /* no enqueue/yield_task for idle tasks */ diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c index 583a1368afe..3640ebbb466 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched/rt.c @@ -3,7 +3,92 @@ * policies) */ +#include "sched.h" + +#include <linux/slab.h> + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + +struct rt_bandwidth def_rt_bandwidth; + +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_bandwidth *rt_b = + container_of(timer, struct rt_bandwidth, rt_period_timer); + ktime_t now; + int overrun; + int idle = 0; + + for (;;) { + now = hrtimer_cb_get_time(timer); + overrun = hrtimer_forward(timer, now, rt_b->rt_period); + + if (!overrun) + break; + + idle = do_sched_rt_period_timer(rt_b, overrun); + } + + return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; +} + +void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) +{ + rt_b->rt_period = ns_to_ktime(period); + rt_b->rt_runtime = runtime; + + raw_spin_lock_init(&rt_b->rt_runtime_lock); + + hrtimer_init(&rt_b->rt_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rt_b->rt_period_timer.function = sched_rt_period_timer; +} + +static void start_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&rt_b->rt_period_timer)) + return; + + raw_spin_lock(&rt_b->rt_runtime_lock); + start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + raw_spin_unlock(&rt_b->rt_runtime_lock); +} + +void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +} + #ifdef CONFIG_RT_GROUP_SCHED +static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) +{ + hrtimer_cancel(&rt_b->rt_period_timer); +} #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) @@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return rt_se->rt_rq; } +void free_rt_sched_group(struct task_group *tg) +{ + int i; + + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + +void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; + rt_rq->tg = tg; + + tg->rt_rq[cpu] = rt_rq; + tg->rt_se[cpu] = rt_se; + + if (!rt_se) + return; + + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err_free_rq; + + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); + } + + return 1; + +err_free_rq: + kfree(rt_rq); +err: + return 0; +} + #else /* CONFIG_RT_GROUP_SCHED */ #define rt_entity_is_task(rt_se) (1) @@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) return &rq->rt; } +void free_rt_sched_group(struct task_group *tg) { } + +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_SMP @@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq) raw_spin_unlock_irqrestore(&rq->lock, flags); } +int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; @@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); - if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + if (runtime >= sched_rt_period(rt_rq)) return 0; balance_runtime(rt_rq); @@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) } /* - * Put task to the end of the run list without the overhead of dequeue - * followed by enqueue. + * Put task to the head or the end of the run list without the overhead of + * dequeue followed by enqueue. */ static void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) @@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); + if (p->rt.nr_cpus_allowed == 1) + goto out; + /* For anything but wake ups, just return the task_cpu */ if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) goto out; @@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) /* Only try algorithms three times */ #define RT_MAX_TRIES 3 -static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); - static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && @@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) pull_rt_task(rq); } -static inline void init_sched_rt_class(void) +void init_sched_rt_class(void) { unsigned int i; - for_each_possible_cpu(i) + for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), GFP_KERNEL, cpu_to_node(i)); + } } #endif /* CONFIG_SMP */ @@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -static const struct sched_class rt_sched_class = { +const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, @@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = { #ifdef CONFIG_SCHED_DEBUG extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); -static void print_rt_stats(struct seq_file *m, int cpu) +void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; struct rt_rq *rt_rq; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h new file mode 100644 index 00000000000..98c0c2623db --- /dev/null +++ b/kernel/sched/sched.h @@ -0,0 +1,1166 @@ + +#include <linux/sched.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/stop_machine.h> + +#include "cpupri.h" + +extern __read_mostly int scheduler_running; + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) + +/* + * Helpers for converting nanosecond timing to jiffy resolution + */ +#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) + +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + +/* + * These are the 'tuning knobs' of the scheduler: + * + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define DEF_TIMESLICE (100 * HZ / 1000) + +/* + * single value that denotes runtime == period, ie unlimited time. + */ +#define RUNTIME_INF ((u64)~0ULL) + +static inline int rt_policy(int policy) +{ + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + +/* + * This is the priority-queue data structure of the RT scheduling class: + */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct rt_bandwidth { + /* nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + ktime_t rt_period; + u64 rt_runtime; + struct hrtimer rt_period_timer; +}; + +extern struct mutex sched_domains_mutex; + +#ifdef CONFIG_CGROUP_SCHED + +#include <linux/cgroup.h> + +struct cfs_rq; +struct rt_rq; + +static LIST_HEAD(task_groups); + +struct cfs_bandwidth { +#ifdef CONFIG_CFS_BANDWIDTH + raw_spinlock_t lock; + ktime_t period; + u64 quota, runtime; + s64 hierarchal_quota; + u64 runtime_expires; + + int idle, timer_active; + struct hrtimer period_timer, slack_timer; + struct list_head throttled_cfs_rq; + + /* statistics */ + int nr_periods, nr_throttled; + u64 throttled_time; +#endif +}; + +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* schedulable entities of this group on each cpu */ + struct sched_entity **se; + /* runqueue "owned" by this group on each cpu */ + struct cfs_rq **cfs_rq; + unsigned long shares; + + atomic_t load_weight; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + struct rt_bandwidth rt_bandwidth; +#endif + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; + +#ifdef CONFIG_SCHED_AUTOGROUP + struct autogroup *autogroup; +#endif + + struct cfs_bandwidth cfs_bandwidth; +}; + +#ifdef CONFIG_FAIR_GROUP_SCHED +#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD + +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES (1UL << 1) +#define MAX_SHARES (1UL << 18) +#endif + +/* Default task group. + * Every task in system belong to this group at bootup. + */ +extern struct task_group root_task_group; + +typedef int (*tg_visitor)(struct task_group *, void *); + +extern int walk_tg_tree_from(struct task_group *from, + tg_visitor down, tg_visitor up, void *data); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + * + * Caller must hold rcu_lock or sufficient equivalent. + */ +static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + return walk_tg_tree_from(&root_task_group, down, up, data); +} + +extern int tg_nop(struct task_group *tg, void *data); + +extern void free_fair_sched_group(struct task_group *tg); +extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent); +extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + +extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); +extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); +extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); + +extern void free_rt_sched_group(struct task_group *tg); +extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent); +extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent); + +#else /* CONFIG_CGROUP_SCHED */ + +struct cfs_bandwidth { }; + +#endif /* CONFIG_CGROUP_SCHED */ + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running, h_nr_running; + + u64 exec_clock; + u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr, *next, *last, *skip; + +#ifdef CONFIG_SCHED_DEBUG + unsigned int nr_spread_over; +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + + /* + * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + int on_list; + struct list_head leaf_cfs_rq_list; + struct task_group *tg; /* group that "owns" this runqueue */ + +#ifdef CONFIG_SMP + /* + * the part of load.weight contributed by tasks + */ + unsigned long task_weight; + + /* + * h_load = weight * f(tg) + * + * Where f(tg) is the recursive weight fraction assigned to + * this group. + */ + unsigned long h_load; + + /* + * Maintaining per-cpu shares distribution for group scheduling + * + * load_stamp is the last time we updated the load average + * load_last is the last time we updated the load average and saw load + * load_unacc_exec_time is currently unaccounted execution time + */ + u64 load_avg; + u64 load_period; + u64 load_stamp, load_last, load_unacc_exec_time; + + unsigned long load_contribution; +#endif /* CONFIG_SMP */ +#ifdef CONFIG_CFS_BANDWIDTH + int runtime_enabled; + u64 runtime_expires; + s64 runtime_remaining; + + u64 throttled_timestamp; + int throttled, throttle_count; + struct list_head throttled_list; +#endif /* CONFIG_CFS_BANDWIDTH */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ +}; + +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + struct { + int curr; /* highest queued rt task prio */ +#ifdef CONFIG_SMP + int next; /* next highest */ +#endif + } highest_prio; +#endif +#ifdef CONFIG_SMP + unsigned long rt_nr_migratory; + unsigned long rt_nr_total; + int overloaded; + struct plist_head pushable_tasks; +#endif + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +#ifdef CONFIG_RT_GROUP_SCHED + unsigned long rt_nr_boosted; + + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; +#endif +}; + +#ifdef CONFIG_SMP + +/* + * We add the notion of a root-domain which will be used to define per-domain + * variables. Each exclusive cpuset essentially defines an island domain by + * fully partitioning the member cpus from any other cpuset. Whenever a new + * exclusive cpuset is created, we also create and attach a new root-domain + * object. + * + */ +struct root_domain { + atomic_t refcount; + atomic_t rto_count; + struct rcu_head rcu; + cpumask_var_t span; + cpumask_var_t online; + + /* + * The "RT overload" flag: it gets set if a CPU has more than + * one runnable RT task. + */ + cpumask_var_t rto_mask; + struct cpupri cpupri; +}; + +extern struct root_domain def_root_domain; + +#endif /* CONFIG_SMP */ + +/* + * This is the main, per-CPU runqueue data structure. + * + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the thread migration code), lock + * acquire operations must be ordered by ascending &runqueue. + */ +struct rq { + /* runqueue lock: */ + raw_spinlock_t lock; + + /* + * nr_running and cpu_load should be in the same cacheline because + * remote CPUs use both these fields when doing load calculation. + */ + unsigned long nr_running; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; +#ifdef CONFIG_NO_HZ + u64 nohz_stamp; + unsigned long nohz_flags; +#endif + int skip_clock_update; + + /* capture load from *all* tasks on this cpu: */ + struct load_weight load; + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; + struct rt_rq rt; + +#ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this cpu: */ + struct list_head leaf_cfs_rq_list; +#endif +#ifdef CONFIG_RT_GROUP_SCHED + struct list_head leaf_rt_rq_list; +#endif + + /* + * This is part of a global counter where only the total sum + * over all CPUs matters. A task can increase this counter on + * one CPU and if it got migrated afterwards it may decrease + * it on another CPU. Always updated under the runqueue lock: + */ + unsigned long nr_uninterruptible; + + struct task_struct *curr, *idle, *stop; + unsigned long next_balance; + struct mm_struct *prev_mm; + + u64 clock; + u64 clock_task; + + atomic_t nr_iowait; + +#ifdef CONFIG_SMP + struct root_domain *rd; + struct sched_domain *sd; + + unsigned long cpu_power; + + unsigned char idle_balance; + /* For active balancing */ + int post_schedule; + int active_balance; + int push_cpu; + struct cpu_stop_work active_balance_work; + /* cpu of this runqueue: */ + int cpu; + int online; + + u64 rt_avg; + u64 age_stamp; + u64 idle_stamp; + u64 avg_idle; +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif +#ifdef CONFIG_PARAVIRT + u64 prev_steal_time; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + u64 prev_steal_time_rq; +#endif + + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + +#ifdef CONFIG_SCHED_HRTICK +#ifdef CONFIG_SMP + int hrtick_csd_pending; + struct call_single_data hrtick_csd; +#endif + struct hrtimer hrtick_timer; +#endif + +#ifdef CONFIG_SCHEDSTATS + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; + /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ + + /* sys_sched_yield() stats */ + unsigned int yld_count; + + /* schedule() stats */ + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; + + /* try_to_wake_up() stats */ + unsigned int ttwu_count; + unsigned int ttwu_local; +#endif + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif +}; + +static inline int cpu_of(struct rq *rq) +{ +#ifdef CONFIG_SMP + return rq->cpu; +#else + return 0; +#endif +} + +DECLARE_PER_CPU(struct rq, runqueues); + +#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) +#define this_rq() (&__get_cpu_var(runqueues)) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) + +#ifdef CONFIG_SMP + +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + lockdep_is_held(&sched_domains_mutex)) + +/* + * The domain tree (rq->sd) is protected by RCU's quiescent state transition. + * See detach_destroy_domains: synchronize_sched for details. + * + * The domain tree of any CPU may only be accessed from within + * preempt-disabled sections. + */ +#define for_each_domain(cpu, __sd) \ + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ + __sd; __sd = __sd->parent) + +#define for_each_lower_domain(sd) for (; sd; sd = sd->child) + +/** + * highest_flag_domain - Return highest sched_domain containing flag. + * @cpu: The cpu whose highest level of sched domain is to + * be returned. + * @flag: The flag to check for the highest sched_domain + * for the given cpu. + * + * Returns the highest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *highest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd, *hsd = NULL; + + for_each_domain(cpu, sd) { + if (!(sd->flags & flag)) + break; + hsd = sd; + } + + return hsd; +} + +DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_id); + +#endif /* CONFIG_SMP */ + +#include "stats.h" +#include "auto_group.h" + +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification with + * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each + * task it moves into the cgroup. Therefore by holding either of those locks, + * we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct task_group *tg; + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock)); + tg = container_of(css, struct task_group, css); + + return autogroup_task_group(p, tg); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED) + struct task_group *tg = task_group(p); +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = tg->cfs_rq[cpu]; + p->se.parent = tg->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = tg->rt_rq[cpu]; + p->rt.parent = tg->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfuly executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. + */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + +/* + * Tunables that become constants when CONFIG_SCHED_DEBUG is off: + */ +#ifdef CONFIG_SCHED_DEBUG +# include <linux/jump_label.h> +# define const_debug __read_mostly +#else +# define const_debug const +#endif + +extern const_debug unsigned int sysctl_sched_features; + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "features.h" + __SCHED_FEAT_NR, +}; + +#undef SCHED_FEAT + +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) +static __always_inline bool static_branch__true(struct jump_label_key *key) +{ + return likely(static_branch(key)); /* Not out of line branch. */ +} + +static __always_inline bool static_branch__false(struct jump_label_key *key) +{ + return unlikely(static_branch(key)); /* Out of line branch. */ +} + +#define SCHED_FEAT(name, enabled) \ +static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +{ \ + return static_branch__##enabled(key); \ +} + +#include "features.h" + +#undef SCHED_FEAT + +extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) +#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) +#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + + + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + raw_spin_unlock_irq(&rq->lock); +#else + raw_spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +extern const struct sched_class stop_sched_class; +extern const struct sched_class rt_sched_class; +extern const struct sched_class fair_sched_class; +extern const struct sched_class idle_sched_class; + + +#ifdef CONFIG_SMP + +extern void trigger_load_balance(struct rq *rq, int cpu); +extern void idle_balance(int this_cpu, struct rq *this_rq); + +#else /* CONFIG_SMP */ + +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif + +extern void sysrq_sched_debug_show(void); +extern void sched_init_granularity(void); +extern void update_max_interval(void); +extern void update_group_power(struct sched_domain *sd, int cpu); +extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu); +extern void init_sched_rt_class(void); +extern void init_sched_fair_class(void); + +extern void resched_task(struct task_struct *p); +extern void resched_cpu(int cpu); + +extern struct rt_bandwidth def_rt_bandwidth; +extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +extern void update_cpu_load(struct rq *this_rq); + +#ifdef CONFIG_CGROUP_CPUACCT +#include <linux/cgroup.h> +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca || !ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +#endif + +static inline void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; +} + +static inline void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} + +extern void update_rq_clock(struct rq *rq); + +extern void activate_task(struct rq *rq, struct task_struct *p, int flags); +extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); + +extern const_debug unsigned int sysctl_sched_time_avg; +extern const_debug unsigned int sysctl_sched_nr_migrate; +extern const_debug unsigned int sysctl_sched_migration_cost; + +static inline u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +void calc_load_account_idle(struct rq *this_rq); + +#ifdef CONFIG_SCHED_HRTICK + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +void hrtick_start(struct rq *rq, u64 delay); + +#else + +static inline int hrtick_enabled(struct rq *rq) +{ + return 0; +} + +#endif /* CONFIG_SCHED_HRTICK */ + +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} +#else +static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } +static inline void sched_avg_update(struct rq *rq) { } +#endif + +extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT + +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work good under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); +extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); +extern void print_cfs_stats(struct seq_file *m, int cpu); +extern void print_rt_stats(struct seq_file *m, int cpu); + +extern void init_cfs_rq(struct cfs_rq *cfs_rq); +extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); +extern void unthrottle_offline_cfs_rqs(struct rq *rq); + +extern void account_cfs_bandwidth_used(int enabled, int was_enabled); + +#ifdef CONFIG_NO_HZ +enum rq_nohz_flag_bits { + NOHZ_TICK_STOPPED, + NOHZ_BALANCE_KICK, + NOHZ_IDLE, +}; + +#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) +#endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c new file mode 100644 index 00000000000..2a581ba8e19 --- /dev/null +++ b/kernel/sched/stats.c @@ -0,0 +1,111 @@ + +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/proc_fs.h> + +#include "sched.h" + +/* + * bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +#define SCHEDSTAT_VERSION 15 + +static int show_schedstat(struct seq_file *seq, void *v) +{ + int cpu; + int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; + char *mask_str = kmalloc(mask_len, GFP_KERNEL); + + if (mask_str == NULL) + return -ENOMEM; + + seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); + seq_printf(seq, "timestamp %lu\n", jiffies); + for_each_online_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_SMP + struct sched_domain *sd; + int dcount = 0; +#endif + + /* runqueue-specific stats */ + seq_printf(seq, + "cpu%d %u %u %u %u %u %u %llu %llu %lu", + cpu, rq->yld_count, + rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->ttwu_count, rq->ttwu_local, + rq->rq_cpu_time, + rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); + + seq_printf(seq, "\n"); + +#ifdef CONFIG_SMP + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { + enum cpu_idle_type itype; + + cpumask_scnprintf(mask_str, mask_len, + sched_domain_span(sd)); + seq_printf(seq, "domain%d %s", dcount++, mask_str); + for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; + itype++) { + seq_printf(seq, " %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], + sd->lb_imbalance[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], + sd->lb_nobusyg[itype]); + } + seq_printf(seq, + " %u %u %u %u %u %u %u %u %u %u %u %u\n", + sd->alb_count, sd->alb_failed, sd->alb_pushed, + sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, + sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); + } + rcu_read_unlock(); +#endif + } + kfree(mask_str); + return 0; +} + +static int schedstat_open(struct inode *inode, struct file *file) +{ + unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = single_open(file, show_schedstat, NULL); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} + +static const struct file_operations proc_schedstat_operations = { + .open = schedstat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_schedstat_init(void) +{ + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); + return 0; +} +module_init(proc_schedstat_init); diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h index 87f9e36ea56..2ef90a51ec5 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched/stats.h @@ -1,108 +1,5 @@ #ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 15 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9; - char *mask_str = kmalloc(mask_len, GFP_KERNEL); - - if (mask_str == NULL) - return -ENOMEM; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcount = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", - cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, - rq->ttwu_count, rq->ttwu_local, - rq->rq_cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { - enum cpu_idle_type itype; - - cpumask_scnprintf(mask_str, mask_len, - sched_domain_span(sd)); - seq_printf(seq, "domain%d %s", dcount++, mask_str); - for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %u %u %u %u %u %u %u %u", - sd->lb_count[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, - " %u %u %u %u %u %u %u %u %u %u %u %u\n", - sd->alb_count, sd->alb_failed, sd->alb_pushed, - sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - rcu_read_unlock(); -#endif - } - kfree(mask_str); - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -static const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init proc_schedstat_init(void) -{ - proc_create("schedstat", 0, NULL, &proc_schedstat_operations); - return 0; -} -module_init(proc_schedstat_init); /* * Expects runqueue lock to be held for atomicity of update @@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk, return; raw_spin_lock(&cputimer->lock); - cputimer->cputime.utime = - cputime_add(cputimer->cputime.utime, cputime); + cputimer->cputime.utime += cputime; raw_spin_unlock(&cputimer->lock); } @@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk, return; raw_spin_lock(&cputimer->lock); - cputimer->cputime.stime = - cputime_add(cputimer->cputime.stime, cputime); + cputimer->cputime.stime += cputime; raw_spin_unlock(&cputimer->lock); } diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c index 8b44e7fa7fb..7b386e86fd2 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched/stop_task.c @@ -1,3 +1,5 @@ +#include "sched.h" + /* * stop-task scheduling class. * @@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -static const struct sched_class stop_sched_class = { +const struct sched_class stop_sched_class = { .next = &rt_sched_class, .enqueue_task = enqueue_task_stop, diff --git a/kernel/signal.c b/kernel/signal.c index 206551563cc..56ce3a618b2 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) info.si_uid = __task_cred(tsk)->uid; rcu_read_unlock(); - info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, - tsk->signal->utime)); - info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime, - tsk->signal->stime)); + info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); + info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime); info.si_status = tsk->exit_code & 0x7f; if (tsk->exit_code & 0x80) diff --git a/kernel/sys.c b/kernel/sys.c index 481611fbd07..ddf8155bf3f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); - utime = stime = cputime_zero; + utime = stime = 0; if (who == RUSAGE_THREAD) { task_times(current, &utime, &stime); @@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) case RUSAGE_SELF: thread_group_times(p, &tgutime, &tgstime); - utime = cputime_add(utime, tgutime); - stime = cputime_add(stime, tgstime); + utime += tgutime; + stime += tgstime; r->ru_nvcsw += p->signal->nvcsw; r->ru_nivcsw += p->signal->nivcsw; r->ru_minflt += p->signal->min_flt; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0ec8b832ab6..7656642e4b8 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -466,6 +466,14 @@ void tick_nohz_idle_enter(void) WARN_ON_ONCE(irqs_disabled()); + /* + * Update the idle state in the scheduler domain hierarchy + * when tick_nohz_stop_sched_tick() is called from the idle loop. + * State will be updated to busy during the first busy tick after + * exiting idle. + */ + set_cpu_sd_state_idle(); + local_irq_disable(); ts = &__get_cpu_var(tick_cpu_sched); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 237841378c0..0c635818640 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void) /* calculate the delta since the last update_wall_time: */ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; - /* return delta convert to nanoseconds using ntp adjusted mult. */ + /* return delta convert to nanoseconds. */ return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } @@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset) * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. * * Note we subtract one in the shift, so that error is really error*2. - * This "saves" dividing(shifting) intererval twice, but keeps the - * (error > interval) comparision as still measuring if error is + * This "saves" dividing(shifting) interval twice, but keeps the + * (error > interval) comparison as still measuring if error is * larger then half an interval. * - * Note: It does not "save" on aggrivation when reading the code. + * Note: It does not "save" on aggravation when reading the code. */ error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); if (error > interval) { @@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset) * nanosecond, and store the amount rounded up into * the error. This causes the likely below to be unlikely. * - * The properfix is to avoid rounding up by using + * The proper fix is to avoid rounding up by using * the high precision timekeeper.xtime_nsec instead of * xtime.tv_nsec everywhere. Fixing this will take some * time. diff --git a/kernel/timer.c b/kernel/timer.c index 9c3c62b0c4b..a297ffcf888 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state) } } +/* Stub timer callback for improperly used timers. */ +static void stub_timer(unsigned long data) +{ + WARN_ON(1); +} + /* * fixup_activate is called when: * - an active object is activated @@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) debug_object_activate(timer, &timer_debug_descr); return 0; } else { - WARN_ON_ONCE(1); + setup_timer(timer, stub_timer, 0); + return 1; } return 0; @@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state) } } +/* + * fixup_assert_init is called when: + * - an untracked/uninit-ed object is found + */ +static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_NOTAVAILABLE: + if (timer->entry.prev == TIMER_ENTRY_STATIC) { + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + debug_object_init(timer, &timer_debug_descr); + return 0; + } else { + setup_timer(timer, stub_timer, 0); + return 1; + } + default: + return 0; + } +} + static struct debug_obj_descr timer_debug_descr = { - .name = "timer_list", - .debug_hint = timer_debug_hint, - .fixup_init = timer_fixup_init, - .fixup_activate = timer_fixup_activate, - .fixup_free = timer_fixup_free, + .name = "timer_list", + .debug_hint = timer_debug_hint, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, + .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) @@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer) debug_object_free(timer, &timer_debug_descr); } +static inline void debug_timer_assert_init(struct timer_list *timer) +{ + debug_object_assert_init(timer, &timer_debug_descr); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key); @@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack); static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } +static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) @@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer) trace_timer_cancel(timer); } +static inline void debug_assert_init(struct timer_list *timer) +{ + debug_timer_assert_init(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer) unsigned long flags; int ret = 0; + debug_assert_init(timer); + timer_stats_timer_clear_start_info(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); @@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer) unsigned long flags; int ret = -1; + debug_assert_init(timer); + base = lock_timer_base(timer, &flags); if (base->running_timer == timer) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a043d224adf..91dc4bc8bf7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | - TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | + TRACE_ITER_IRQ_INFO; static int trace_stop_count; static DEFINE_RAW_SPINLOCK(tracing_start_lock); @@ -426,6 +427,7 @@ static const char *trace_options[] = { "record-cmd", "overwrite", "disable_on_free", + "irq-info", NULL }; @@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p) trace_event_read_unlock(); } +static void +get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +{ + unsigned long count; + int cpu; + + *total = 0; + *entries = 0; + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. + */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + *total += count; + } else + *total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + *entries += count; + } +} + static void print_lat_help_header(struct seq_file *m) { seq_puts(m, "# _------=> CPU# \n"); @@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_func_help_header(struct seq_file *m) +static void print_event_info(struct trace_array *tr, struct seq_file *m) +{ + unsigned long total; + unsigned long entries; + + get_total_entries(tr, &total, &entries); + seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", + entries, total, num_online_cpus()); + seq_puts(m, "#\n"); +} + +static void print_func_help_header(struct trace_array *tr, struct seq_file *m) { - seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + print_event_info(tr, m); + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } +static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +{ + print_event_info(tr, m); + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | |||| | |\n"); +} void print_trace_header(struct seq_file *m, struct trace_iterator *iter) @@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; struct tracer *type = current_trace; - unsigned long entries = 0; - unsigned long total = 0; - unsigned long count; + unsigned long entries; + unsigned long total; const char *name = "preemption"; - int cpu; if (type) name = type->name; - - for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); - /* - * If this buffer has skipped entries, then we hold all - * entries for the trace and we need to ignore the - * ones before the time stamp. - */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; - /* total is the same as the entries */ - total += count; - } else - total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); - entries += count; - } + get_total_entries(tr, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return print_trace_fmt(iter); } +void trace_latency_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) + print_trace_header(m, iter); + + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); +} + void trace_default_header(struct seq_file *m) { struct trace_iterator *iter = m->private; @@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m) if (!(trace_flags & TRACE_ITER_VERBOSE)) print_lat_help_header(m); } else { - if (!(trace_flags & TRACE_ITER_VERBOSE)) - print_func_help_header(m); + if (!(trace_flags & TRACE_ITER_VERBOSE)) { + if (trace_flags & TRACE_ITER_IRQ_INFO) + print_func_help_header_irq(iter->tr, m); + else + print_func_help_header(iter->tr, m); + } } } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 092e1f8d18d..2c2657462ac 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -654,6 +655,7 @@ enum trace_iterator_flags { TRACE_ITER_RECORD_CMD = 0x100000, TRACE_ITER_OVERWRITE = 0x200000, TRACE_ITER_STOP_ON_FREE = 0x400000, + TRACE_ITER_IRQ_INFO = 0x800000, }; /* diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 95dc31efd6d..f04cc3136bd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -27,6 +27,12 @@ #include "trace.h" #include "trace_output.h" +#define DEFAULT_SYS_FILTER_MESSAGE \ + "### global filter ###\n" \ + "# Use this to set filters for multiple events.\n" \ + "# Only events with the given fields will be affected.\n" \ + "# If no events are modified, an error message will be displayed here" + enum filter_op_ids { OP_OR, @@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system, if (filter && filter->filter_string) trace_seq_printf(s, "%s\n", filter->filter_string); else - trace_seq_printf(s, "none\n"); + trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n"); mutex_unlock(&event_mutex); } @@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, if (!filter) goto out; - replace_filter_string(filter, filter_string); + /* System filters just show a default message */ + kfree(filter->filter_string); + filter->filter_string = NULL; + /* * No event actually uses the system filter * we can free it without synchronize_sched(). @@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, parse_init(ps, filter_ops, filter_string); err = filter_parse(ps); - if (err) { - append_filter_err(ps, system->filter); - goto out; - } + if (err) + goto err_filter; err = replace_system_preds(system, ps, filter_string); if (err) - append_filter_err(ps, system->filter); + goto err_filter; out: filter_opstack_clear(ps); @@ -1865,6 +1872,11 @@ out_unlock: mutex_unlock(&event_mutex); return err; + +err_filter: + replace_filter_string(filter, filter_string); + append_filter_err(ps, system->filter); + goto out; } #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 20dad0d7a16..99d20e92036 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) } static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } -static void irqsoff_print_header(struct seq_file *s) { } static void irqsoff_trace_open(struct trace_iterator *iter) { } static void irqsoff_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void irqsoff_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void irqsoff_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 51999309a6c..0d6ff355594 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter) unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned long secs = (unsigned long)t; char comm[TASK_COMM_LEN]; + int ret; trace_find_cmdline(entry->pid, comm); - return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", - comm, entry->pid, iter->cpu, secs, usec_rem); + ret = trace_seq_printf(s, "%16s-%-5d [%03d] ", + comm, entry->pid, iter->cpu); + if (!ret) + return 0; + + if (trace_flags & TRACE_ITER_IRQ_INFO) { + ret = trace_print_lat_fmt(s, entry); + if (!ret) + return 0; + } + + return trace_seq_printf(s, " %5lu.%06lu: ", + secs, usec_rem); } int trace_print_lat_context(struct trace_iterator *iter) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index e4a70c0c71b..ff791ea48b5 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter) } static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } -static void wakeup_print_header(struct seq_file *s) { } static void wakeup_trace_open(struct trace_iterator *iter) { } static void wakeup_trace_close(struct trace_iterator *iter) { } + +#ifdef CONFIG_FUNCTION_TRACER +static void wakeup_print_header(struct seq_file *s) +{ + trace_default_header(s); +} +#else +static void wakeup_print_header(struct seq_file *s) +{ + trace_latency_header(s); +} +#endif /* CONFIG_FUNCTION_TRACER */ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 5bbfac85866..23b4d784ebd 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk) local_irq_save(flags); time = tsk->stime + tsk->utime; - dtime = cputime_sub(time, tsk->acct_timexpd); + dtime = time - tsk->acct_timexpd; jiffies_to_timeval(cputime_to_jiffies(dtime), &value); delta = value.tv_sec; delta = delta * USEC_PER_SEC + value.tv_usec; diff --git a/kernel/wait.c b/kernel/wait.c index 26fa7797f90..7fdd9eaca2c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,10 +10,10 @@ #include <linux/wait.h> #include <linux/hash.h> -void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) +void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key) { spin_lock_init(&q->lock); - lockdep_set_class(&q->lock, key); + lockdep_set_class_and_name(&q->lock, key, name); INIT_LIST_HEAD(&q->task_list); } diff --git a/lib/debugobjects.c b/lib/debugobjects.c index a78b7c6e042..77cb245f8e7 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -268,12 +268,16 @@ static void debug_print_object(struct debug_obj *obj, char *msg) * Try to repair the damage, so we have a better chance to get useful * debug output. */ -static void +static int debug_object_fixup(int (*fixup)(void *addr, enum debug_obj_state state), void * addr, enum debug_obj_state state) { + int fixed = 0; + if (fixup) - debug_objects_fixups += fixup(addr, state); + fixed = fixup(addr, state); + debug_objects_fixups += fixed; + return fixed; } static void debug_object_is_on_stack(void *addr, int onstack) @@ -386,6 +390,9 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) struct debug_bucket *db; struct debug_obj *obj; unsigned long flags; + struct debug_obj o = { .object = addr, + .state = ODEBUG_STATE_NOTAVAILABLE, + .descr = descr }; if (!debug_objects_enabled) return; @@ -425,8 +432,9 @@ void debug_object_activate(void *addr, struct debug_obj_descr *descr) * let the type specific code decide whether this is * true or not. */ - debug_object_fixup(descr->fixup_activate, addr, - ODEBUG_STATE_NOTAVAILABLE); + if (debug_object_fixup(descr->fixup_activate, addr, + ODEBUG_STATE_NOTAVAILABLE)) + debug_print_object(&o, "activate"); } /** @@ -563,6 +571,44 @@ out_unlock: } /** + * debug_object_assert_init - debug checks when object should be init-ed + * @addr: address of the object + * @descr: pointer to an object specific debug description structure + */ +void debug_object_assert_init(void *addr, struct debug_obj_descr *descr) +{ + struct debug_bucket *db; + struct debug_obj *obj; + unsigned long flags; + + if (!debug_objects_enabled) + return; + + db = get_bucket((unsigned long) addr); + + raw_spin_lock_irqsave(&db->lock, flags); + + obj = lookup_object(addr, db); + if (!obj) { + struct debug_obj o = { .object = addr, + .state = ODEBUG_STATE_NOTAVAILABLE, + .descr = descr }; + + raw_spin_unlock_irqrestore(&db->lock, flags); + /* + * Maybe the object is static. Let the type specific + * code decide what to do. + */ + if (debug_object_fixup(descr->fixup_assert_init, addr, + ODEBUG_STATE_NOTAVAILABLE)) + debug_print_object(&o, "assert_init"); + return; + } + + raw_spin_unlock_irqrestore(&db->lock, flags); +} + +/** * debug_object_active_state - debug checks object usage state machine * @addr: address of the object * @descr: pointer to an object specific debug description structure diff --git a/mm/Kconfig b/mm/Kconfig index 011b110365c..e338407f122 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -131,6 +131,12 @@ config SPARSEMEM_VMEMMAP config HAVE_MEMBLOCK boolean +config HAVE_MEMBLOCK_NODE_MAP + boolean + +config ARCH_DISCARD_MEMBLOCK + boolean + config NO_BOOTMEM boolean diff --git a/mm/memblock.c b/mm/memblock.c index 84bec4969ed..77b5f227e1d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,12 +20,23 @@ #include <linux/seq_file.h> #include <linux/memblock.h> -struct memblock memblock __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; + +struct memblock memblock __initdata_memblock = { + .memory.regions = memblock_memory_init_regions, + .memory.cnt = 1, /* empty dummy entry */ + .memory.max = INIT_MEMBLOCK_REGIONS, + + .reserved.regions = memblock_reserved_init_regions, + .reserved.cnt = 1, /* empty dummy entry */ + .reserved.max = INIT_MEMBLOCK_REGIONS, + + .current_limit = MEMBLOCK_ALLOC_ANYWHERE, +}; int memblock_debug __initdata_memblock; -int memblock_can_resize __initdata_memblock; -static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; -static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static int memblock_can_resize __initdata_memblock; /* inline so we don't get a warning when pr_debug is compiled out */ static inline const char *memblock_type_name(struct memblock_type *type) @@ -38,20 +49,15 @@ static inline const char *memblock_type_name(struct memblock_type *type) return "unknown"; } -/* - * Address comparison utilities - */ - -static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) -{ - return addr & ~(size - 1); -} - -static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +/* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ +static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return (addr + (size - 1)) & ~(size - 1); + return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); } +/* + * Address comparison utilities + */ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, phys_addr_t base2, phys_addr_t size2) { @@ -73,83 +79,69 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } -/* - * Find, allocate, deallocate or reserve unreserved regions. All allocations - * are top-down. +/** + * memblock_find_in_range_node - find free area in given range and node + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * + * Find @size free area aligned to @align in the specified range and node. + * + * RETURNS: + * Found address on success, %0 on failure. */ - -static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align) +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align, int nid) { - phys_addr_t base, res_base; - long j; - - /* In case, huge size is requested */ - if (end < size) - return MEMBLOCK_ERROR; - - base = memblock_align_down((end - size), align); + phys_addr_t this_start, this_end, cand; + u64 i; - /* Prevent allocations returning 0 as it's also used to - * indicate an allocation failure - */ - if (start == 0) - start = PAGE_SIZE; - - while (start <= base) { - j = memblock_overlaps_region(&memblock.reserved, base, size); - if (j < 0) - return base; - res_base = memblock.reserved.regions[j].base; - if (res_base < size) - break; - base = memblock_align_down(res_base - size, align); - } + /* align @size to avoid excessive fragmentation on reserved array */ + size = round_up(size, align); - return MEMBLOCK_ERROR; -} - -static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, - phys_addr_t align, phys_addr_t start, phys_addr_t end) -{ - long i; - - BUG_ON(0 == size); - - /* Pump up max_addr */ + /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; - /* We do a top-down search, this tends to limit memory - * fragmentation by keeping early boot allocs near the - * top of memory - */ - for (i = memblock.memory.cnt - 1; i >= 0; i--) { - phys_addr_t memblockbase = memblock.memory.regions[i].base; - phys_addr_t memblocksize = memblock.memory.regions[i].size; - phys_addr_t bottom, top, found; + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); + end = max(start, end); - if (memblocksize < size) - continue; - if ((memblockbase + memblocksize) <= start) - break; - bottom = max(memblockbase, start); - top = min(memblockbase + memblocksize, end); - if (bottom >= top) + for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + if (this_end < size) continue; - found = memblock_find_region(bottom, top, size, align); - if (found != MEMBLOCK_ERROR) - return found; + + cand = round_down(this_end - size, align); + if (cand >= this_start) + return cand; } - return MEMBLOCK_ERROR; + return 0; } -/* - * Find a free area with specified alignment in a specific range. +/** + * memblock_find_in_range - find free area in given range + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * + * Find @size free area aligned to @align in the specified range. + * + * RETURNS: + * Found address on success, %0 on failure. */ -u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) +phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align) { - return memblock_find_base(size, align, start, end); + return memblock_find_in_range_node(start, end, size, align, + MAX_NUMNODES); } /* @@ -178,25 +170,21 @@ int __init_memblock memblock_reserve_reserved_regions(void) static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { - unsigned long i; - - for (i = r; i < type->cnt - 1; i++) { - type->regions[i].base = type->regions[i + 1].base; - type->regions[i].size = type->regions[i + 1].size; - } + type->total_size -= type->regions[r].size; + memmove(&type->regions[r], &type->regions[r + 1], + (type->cnt - (r + 1)) * sizeof(type->regions[r])); type->cnt--; /* Special case for empty arrays */ if (type->cnt == 0) { + WARN_ON(type->total_size != 0); type->cnt = 1; type->regions[0].base = 0; type->regions[0].size = 0; + memblock_set_region_node(&type->regions[0], MAX_NUMNODES); } } -/* Defined below but needed now */ -static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); - static int __init_memblock memblock_double_array(struct memblock_type *type) { struct memblock_region *new_array, *old_array; @@ -226,10 +214,10 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); - addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); + addr = new_array ? __pa(new_array) : 0; } else - addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); - if (addr == MEMBLOCK_ERROR) { + addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); + if (!addr) { pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", memblock_type_name(type), type->max, type->max * 2); return -1; @@ -254,7 +242,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; /* Add the new reserved region now. Should not fail ! */ - BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size)); + BUG_ON(memblock_reserve(addr, new_size)); /* If the array wasn't our static init one, then free it. We only do * that before SLAB is available as later on, we don't know whether @@ -268,343 +256,514 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; } -int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, - phys_addr_t addr2, phys_addr_t size2) -{ - return 1; -} - -static long __init_memblock memblock_add_region(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +/** + * memblock_merge_regions - merge neighboring compatible regions + * @type: memblock type to scan + * + * Scan @type and merge neighboring compatible regions. + */ +static void __init_memblock memblock_merge_regions(struct memblock_type *type) { - phys_addr_t end = base + size; - int i, slot = -1; - - /* First try and coalesce this MEMBLOCK with others */ - for (i = 0; i < type->cnt; i++) { - struct memblock_region *rgn = &type->regions[i]; - phys_addr_t rend = rgn->base + rgn->size; + int i = 0; - /* Exit if there's no possible hits */ - if (rgn->base > end || rgn->size == 0) - break; + /* cnt never goes below 1 */ + while (i < type->cnt - 1) { + struct memblock_region *this = &type->regions[i]; + struct memblock_region *next = &type->regions[i + 1]; - /* Check if we are fully enclosed within an existing - * block - */ - if (rgn->base <= base && rend >= end) - return 0; + if (this->base + this->size != next->base || + memblock_get_region_node(this) != + memblock_get_region_node(next)) { + BUG_ON(this->base + this->size > next->base); + i++; + continue; + } - /* Check if we overlap or are adjacent with the bottom - * of a block. - */ - if (base < rgn->base && end >= rgn->base) { - /* If we can't coalesce, create a new block */ - if (!memblock_memory_can_coalesce(base, size, - rgn->base, - rgn->size)) { - /* Overlap & can't coalesce are mutually - * exclusive, if you do that, be prepared - * for trouble - */ - WARN_ON(end != rgn->base); - goto new_block; - } - /* We extend the bottom of the block down to our - * base - */ - rgn->base = base; - rgn->size = rend - base; + this->size += next->size; + memmove(next, next + 1, (type->cnt - (i + 1)) * sizeof(*next)); + type->cnt--; + } +} - /* Return if we have nothing else to allocate - * (fully coalesced) - */ - if (rend >= end) - return 0; +/** + * memblock_insert_region - insert new memblock region + * @type: memblock type to insert into + * @idx: index for the insertion point + * @base: base address of the new region + * @size: size of the new region + * + * Insert new memblock region [@base,@base+@size) into @type at @idx. + * @type must already have extra room to accomodate the new region. + */ +static void __init_memblock memblock_insert_region(struct memblock_type *type, + int idx, phys_addr_t base, + phys_addr_t size, int nid) +{ + struct memblock_region *rgn = &type->regions[idx]; - /* We continue processing from the end of the - * coalesced block. - */ - base = rend; - size = end - base; - } + BUG_ON(type->cnt >= type->max); + memmove(rgn + 1, rgn, (type->cnt - idx) * sizeof(*rgn)); + rgn->base = base; + rgn->size = size; + memblock_set_region_node(rgn, nid); + type->cnt++; + type->total_size += size; +} - /* Now check if we overlap or are adjacent with the - * top of a block - */ - if (base <= rend && end >= rend) { - /* If we can't coalesce, create a new block */ - if (!memblock_memory_can_coalesce(rgn->base, - rgn->size, - base, size)) { - /* Overlap & can't coalesce are mutually - * exclusive, if you do that, be prepared - * for trouble - */ - WARN_ON(rend != base); - goto new_block; - } - /* We adjust our base down to enclose the - * original block and destroy it. It will be - * part of our new allocation. Since we've - * freed an entry, we know we won't fail - * to allocate one later, so we won't risk - * losing the original block allocation. - */ - size += (base - rgn->base); - base = rgn->base; - memblock_remove_region(type, i--); - } - } +/** + * memblock_add_region - add new memblock region + * @type: memblock type to add new region into + * @base: base address of the new region + * @size: size of the new region + * @nid: nid of the new region + * + * Add new memblock region [@base,@base+@size) into @type. The new region + * is allowed to overlap with existing ones - overlaps don't affect already + * existing regions. @type is guaranteed to be minimal (all neighbouring + * compatible regions are merged) after the addition. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int __init_memblock memblock_add_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, int nid) +{ + bool insert = false; + phys_addr_t obase = base; + phys_addr_t end = base + memblock_cap_size(base, &size); + int i, nr_new; - /* If the array is empty, special case, replace the fake - * filler region and return - */ - if ((type->cnt == 1) && (type->regions[0].size == 0)) { + /* special case for empty array */ + if (type->regions[0].size == 0) { + WARN_ON(type->cnt != 1 || type->total_size); type->regions[0].base = base; type->regions[0].size = size; + memblock_set_region_node(&type->regions[0], nid); + type->total_size = size; return 0; } - - new_block: - /* If we are out of space, we fail. It's too late to resize the array - * but then this shouldn't have happened in the first place. +repeat: + /* + * The following is executed twice. Once with %false @insert and + * then with %true. The first counts the number of regions needed + * to accomodate the new area. The second actually inserts them. */ - if (WARN_ON(type->cnt >= type->max)) - return -1; + base = obase; + nr_new = 0; - /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ - for (i = type->cnt - 1; i >= 0; i--) { - if (base < type->regions[i].base) { - type->regions[i+1].base = type->regions[i].base; - type->regions[i+1].size = type->regions[i].size; - } else { - type->regions[i+1].base = base; - type->regions[i+1].size = size; - slot = i + 1; + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; + + if (rbase >= end) break; + if (rend <= base) + continue; + /* + * @rgn overlaps. If it separates the lower part of new + * area, insert that portion. + */ + if (rbase > base) { + nr_new++; + if (insert) + memblock_insert_region(type, i++, base, + rbase - base, nid); } + /* area below @rend is dealt with, forget about it */ + base = min(rend, end); } - if (base < type->regions[0].base) { - type->regions[0].base = base; - type->regions[0].size = size; - slot = 0; + + /* insert the remaining portion */ + if (base < end) { + nr_new++; + if (insert) + memblock_insert_region(type, i, base, end - base, nid); } - type->cnt++; - /* The array is full ? Try to resize it. If that fails, we undo - * our allocation and return an error + /* + * If this was the first round, resize array and repeat for actual + * insertions; otherwise, merge and return. */ - if (type->cnt == type->max && memblock_double_array(type)) { - BUG_ON(slot < 0); - memblock_remove_region(type, slot); - return -1; + if (!insert) { + while (type->cnt + nr_new > type->max) + if (memblock_double_array(type) < 0) + return -ENOMEM; + insert = true; + goto repeat; + } else { + memblock_merge_regions(type); + return 0; } - - return 0; } -long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, + int nid) { - return memblock_add_region(&memblock.memory, base, size); + return memblock_add_region(&memblock.memory, base, size, nid); +} +int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +{ + return memblock_add_region(&memblock.memory, base, size, MAX_NUMNODES); } -static long __init_memblock __memblock_remove(struct memblock_type *type, - phys_addr_t base, phys_addr_t size) +/** + * memblock_isolate_range - isolate given range into disjoint memblocks + * @type: memblock type to isolate range for + * @base: base of range to isolate + * @size: size of range to isolate + * @start_rgn: out parameter for the start of isolated region + * @end_rgn: out parameter for the end of isolated region + * + * Walk @type and ensure that regions don't cross the boundaries defined by + * [@base,@base+@size). Crossing regions are split at the boundaries, + * which may create at most two more regions. The index of the first + * region inside the range is returned in *@start_rgn and end in *@end_rgn. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int __init_memblock memblock_isolate_range(struct memblock_type *type, + phys_addr_t base, phys_addr_t size, + int *start_rgn, int *end_rgn) { - phys_addr_t end = base + size; + phys_addr_t end = base + memblock_cap_size(base, &size); int i; - /* Walk through the array for collisions */ + *start_rgn = *end_rgn = 0; + + /* we'll create at most two more regions */ + while (type->cnt + 2 > type->max) + if (memblock_double_array(type) < 0) + return -ENOMEM; + for (i = 0; i < type->cnt; i++) { struct memblock_region *rgn = &type->regions[i]; - phys_addr_t rend = rgn->base + rgn->size; + phys_addr_t rbase = rgn->base; + phys_addr_t rend = rbase + rgn->size; - /* Nothing more to do, exit */ - if (rgn->base > end || rgn->size == 0) + if (rbase >= end) break; - - /* If we fully enclose the block, drop it */ - if (base <= rgn->base && end >= rend) { - memblock_remove_region(type, i--); + if (rend <= base) continue; - } - /* If we are fully enclosed within a block - * then we need to split it and we are done - */ - if (base > rgn->base && end < rend) { - rgn->size = base - rgn->base; - if (!memblock_add_region(type, end, rend - end)) - return 0; - /* Failure to split is bad, we at least - * restore the block before erroring + if (rbase < base) { + /* + * @rgn intersects from below. Split and continue + * to process the next region - the new top half. + */ + rgn->base = base; + rgn->size -= base - rbase; + type->total_size -= base - rbase; + memblock_insert_region(type, i, rbase, base - rbase, + memblock_get_region_node(rgn)); + } else if (rend > end) { + /* + * @rgn intersects from above. Split and redo the + * current region - the new bottom half. */ - rgn->size = rend - rgn->base; - WARN_ON(1); - return -1; - } - - /* Check if we need to trim the bottom of a block */ - if (rgn->base < end && rend > end) { - rgn->size -= end - rgn->base; rgn->base = end; - break; + rgn->size -= end - rbase; + type->total_size -= end - rbase; + memblock_insert_region(type, i--, rbase, end - rbase, + memblock_get_region_node(rgn)); + } else { + /* @rgn is fully contained, record it */ + if (!*end_rgn) + *start_rgn = i; + *end_rgn = i + 1; } + } - /* And check if we need to trim the top of a block */ - if (base < rend) - rgn->size -= rend - base; + return 0; +} - } +static int __init_memblock __memblock_remove(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) +{ + int start_rgn, end_rgn; + int i, ret; + + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; + + for (i = end_rgn - 1; i >= start_rgn; i--) + memblock_remove_region(type, i); return 0; } -long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { return __memblock_remove(&memblock.memory, base, size); } -long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) { + memblock_dbg(" memblock_free: [%#016llx-%#016llx] %pF\n", + (unsigned long long)base, + (unsigned long long)base + size, + (void *)_RET_IP_); + return __memblock_remove(&memblock.reserved, base, size); } -long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { struct memblock_type *_rgn = &memblock.reserved; + memblock_dbg("memblock_reserve: [%#016llx-%#016llx] %pF\n", + (unsigned long long)base, + (unsigned long long)base + size, + (void *)_RET_IP_); BUG_ON(0 == size); - return memblock_add_region(_rgn, base, size); + return memblock_add_region(_rgn, base, size, MAX_NUMNODES); } -phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +/** + * __next_free_mem_range - next function for for_each_free_mem_range() + * @idx: pointer to u64 loop variable + * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Find the first free area from *@idx which matches @nid, fill the out + * parameters, and update *@idx for the next iteration. The lower 32bit of + * *@idx contains index into memory region and the upper 32bit indexes the + * areas before each reserved region. For example, if reserved regions + * look like the following, + * + * 0:[0-16), 1:[32-48), 2:[128-130) + * + * The upper 32bit indexes the following regions. + * + * 0:[0-0), 1:[16-32), 2:[48-128), 3:[130-MAX) + * + * As both region arrays are sorted, the function advances the two indices + * in lockstep and returns each intersection. + */ +void __init_memblock __next_free_mem_range(u64 *idx, int nid, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - phys_addr_t found; + struct memblock_type *mem = &memblock.memory; + struct memblock_type *rsv = &memblock.reserved; + int mi = *idx & 0xffffffff; + int ri = *idx >> 32; - /* We align the size to limit fragmentation. Without this, a lot of - * small allocs quickly eat up the whole reserve array on sparc - */ - size = memblock_align_up(size, align); + for ( ; mi < mem->cnt; mi++) { + struct memblock_region *m = &mem->regions[mi]; + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; - found = memblock_find_base(size, align, 0, max_addr); - if (found != MEMBLOCK_ERROR && - !memblock_add_region(&memblock.reserved, found, size)) - return found; + /* only memory regions are associated with nodes, check it */ + if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + continue; - return 0; + /* scan areas before each reservation for intersection */ + for ( ; ri < rsv->cnt + 1; ri++) { + struct memblock_region *r = &rsv->regions[ri]; + phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; + phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + + /* if ri advanced past mi, break out to advance mi */ + if (r_start >= m_end) + break; + /* if the two regions intersect, we're done */ + if (m_start < r_end) { + if (out_start) + *out_start = max(m_start, r_start); + if (out_end) + *out_end = min(m_end, r_end); + if (out_nid) + *out_nid = memblock_get_region_node(m); + /* + * The region which ends first is advanced + * for the next iteration. + */ + if (m_end <= r_end) + mi++; + else + ri++; + *idx = (u32)mi | (u64)ri << 32; + return; + } + } + } + + /* signal end of iteration */ + *idx = ULLONG_MAX; } -phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +/** + * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() + * @idx: pointer to u64 loop variable + * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * @p_nid: ptr to int for nid of the range, can be %NULL + * + * Reverse of __next_free_mem_range(). + */ +void __init_memblock __next_free_mem_range_rev(u64 *idx, int nid, + phys_addr_t *out_start, + phys_addr_t *out_end, int *out_nid) { - phys_addr_t alloc; + struct memblock_type *mem = &memblock.memory; + struct memblock_type *rsv = &memblock.reserved; + int mi = *idx & 0xffffffff; + int ri = *idx >> 32; - alloc = __memblock_alloc_base(size, align, max_addr); + if (*idx == (u64)ULLONG_MAX) { + mi = mem->cnt - 1; + ri = rsv->cnt; + } - if (alloc == 0) - panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", - (unsigned long long) size, (unsigned long long) max_addr); + for ( ; mi >= 0; mi--) { + struct memblock_region *m = &mem->regions[mi]; + phys_addr_t m_start = m->base; + phys_addr_t m_end = m->base + m->size; - return alloc; -} + /* only memory regions are associated with nodes, check it */ + if (nid != MAX_NUMNODES && nid != memblock_get_region_node(m)) + continue; -phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) -{ - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); -} + /* scan areas before each reservation for intersection */ + for ( ; ri >= 0; ri--) { + struct memblock_region *r = &rsv->regions[ri]; + phys_addr_t r_start = ri ? r[-1].base + r[-1].size : 0; + phys_addr_t r_end = ri < rsv->cnt ? r->base : ULLONG_MAX; + + /* if ri advanced past mi, break out to advance mi */ + if (r_end <= m_start) + break; + /* if the two regions intersect, we're done */ + if (m_end > r_start) { + if (out_start) + *out_start = max(m_start, r_start); + if (out_end) + *out_end = min(m_end, r_end); + if (out_nid) + *out_nid = memblock_get_region_node(m); + + if (m_start >= r_start) + mi--; + else + ri--; + *idx = (u32)mi | (u64)ri << 32; + return; + } + } + } + *idx = ULLONG_MAX; +} +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * Additional node-local allocators. Search for node memory is bottom up - * and walks memblock regions within that node bottom-up as well, but allocation - * within an memblock region is top-down. XXX I plan to fix that at some stage - * - * WARNING: Only available after early_node_map[] has been populated, - * on some architectures, that is after all the calls to add_active_range() - * have been done to populate it. + * Common iterator interface used to define for_each_mem_range(). */ - -phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) +void __init_memblock __next_mem_pfn_range(int *idx, int nid, + unsigned long *out_start_pfn, + unsigned long *out_end_pfn, int *out_nid) { -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP - /* - * This code originates from sparc which really wants use to walk by addresses - * and returns the nid. This is not very convenient for early_pfn_map[] users - * as the map isn't sorted yet, and it really wants to be walked by nid. - * - * For now, I implement the inefficient method below which walks the early - * map multiple times. Eventually we may want to use an ARCH config option - * to implement a completely different method for both case. - */ - unsigned long start_pfn, end_pfn; - int i; + struct memblock_type *type = &memblock.memory; + struct memblock_region *r; - for (i = 0; i < MAX_NUMNODES; i++) { - get_pfn_range_for_nid(i, &start_pfn, &end_pfn); - if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) + while (++*idx < type->cnt) { + r = &type->regions[*idx]; + + if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size)) continue; - *nid = i; - return min(end, PFN_PHYS(end_pfn)); + if (nid == MAX_NUMNODES || nid == r->nid) + break; + } + if (*idx >= type->cnt) { + *idx = -1; + return; } -#endif - *nid = 0; - return end; + if (out_start_pfn) + *out_start_pfn = PFN_UP(r->base); + if (out_end_pfn) + *out_end_pfn = PFN_DOWN(r->base + r->size); + if (out_nid) + *out_nid = r->nid; } -static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, - phys_addr_t size, - phys_addr_t align, int nid) +/** + * memblock_set_node - set node ID on memblock regions + * @base: base of area to set node ID for + * @size: size of area to set node ID for + * @nid: node ID to set + * + * Set the nid of memblock memory regions in [@base,@base+@size) to @nid. + * Regions which cross the area boundaries are split as necessary. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, + int nid) { - phys_addr_t start, end; + struct memblock_type *type = &memblock.memory; + int start_rgn, end_rgn; + int i, ret; - start = mp->base; - end = start + mp->size; + ret = memblock_isolate_range(type, base, size, &start_rgn, &end_rgn); + if (ret) + return ret; - start = memblock_align_up(start, align); - while (start < end) { - phys_addr_t this_end; - int this_nid; + for (i = start_rgn; i < end_rgn; i++) + type->regions[i].nid = nid; - this_end = memblock_nid_range(start, end, &this_nid); - if (this_nid == nid) { - phys_addr_t ret = memblock_find_region(start, this_end, size, align); - if (ret != MEMBLOCK_ERROR && - !memblock_add_region(&memblock.reserved, ret, size)) - return ret; - } - start = this_end; - } + memblock_merge_regions(type); + return 0; +} +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + phys_addr_t found; - return MEMBLOCK_ERROR; + found = memblock_find_in_range_node(0, max_addr, size, align, nid); + if (found && !memblock_reserve(found, size)) + return found; + + return 0; } phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { - struct memblock_type *mem = &memblock.memory; - int i; + return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); +} - BUG_ON(0 == size); +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES); +} - /* We align the size to limit fragmentation. Without this, a lot of - * small allocs quickly eat up the whole reserve array on sparc - */ - size = memblock_align_up(size, align); +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + phys_addr_t alloc; - /* We do a bottom-up search for a region with the right - * nid since that's easier considering how memblock_nid_range() - * works - */ - for (i = 0; i < mem->cnt; i++) { - phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], - size, align, nid); - if (ret != MEMBLOCK_ERROR) - return ret; - } + alloc = __memblock_alloc_base(size, align, max_addr); - return 0; + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; +} + +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) @@ -613,7 +772,7 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i if (res) return res; - return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); } @@ -621,10 +780,9 @@ phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, i * Remaining API functions */ -/* You must call memblock_analyze() before this. */ phys_addr_t __init memblock_phys_mem_size(void) { - return memblock.memory_size; + return memblock.memory.total_size; } /* lowest address */ @@ -640,45 +798,28 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); } -/* You must call memblock_analyze() after this. */ -void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) +void __init memblock_enforce_memory_limit(phys_addr_t limit) { unsigned long i; - phys_addr_t limit; - struct memblock_region *p; + phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; - if (!memory_limit) + if (!limit) return; - /* Truncate the memblock regions to satisfy the memory limit. */ - limit = memory_limit; + /* find out max address */ for (i = 0; i < memblock.memory.cnt; i++) { - if (limit > memblock.memory.regions[i].size) { - limit -= memblock.memory.regions[i].size; - continue; - } - - memblock.memory.regions[i].size = limit; - memblock.memory.cnt = i + 1; - break; - } - - memory_limit = memblock_end_of_DRAM(); + struct memblock_region *r = &memblock.memory.regions[i]; - /* And truncate any reserves above the limit also. */ - for (i = 0; i < memblock.reserved.cnt; i++) { - p = &memblock.reserved.regions[i]; - - if (p->base > memory_limit) - p->size = 0; - else if ((p->base + p->size) > memory_limit) - p->size = memory_limit - p->base; - - if (p->size == 0) { - memblock_remove_region(&memblock.reserved, i); - i--; + if (limit <= r->size) { + max_addr = r->base + limit; + break; } + limit -= r->size; } + + /* truncate both memory and reserved regions */ + __memblock_remove(&memblock.memory, max_addr, (phys_addr_t)ULLONG_MAX); + __memblock_remove(&memblock.reserved, max_addr, (phys_addr_t)ULLONG_MAX); } static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) @@ -712,16 +853,18 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { int idx = memblock_search(&memblock.memory, base); + phys_addr_t end = base + memblock_cap_size(base, &size); if (idx == -1) return 0; return memblock.memory.regions[idx].base <= base && (memblock.memory.regions[idx].base + - memblock.memory.regions[idx].size) >= (base + size); + memblock.memory.regions[idx].size) >= end; } int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { + memblock_cap_size(base, &size); return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; } @@ -731,86 +874,45 @@ void __init_memblock memblock_set_current_limit(phys_addr_t limit) memblock.current_limit = limit; } -static void __init_memblock memblock_dump(struct memblock_type *region, char *name) +static void __init_memblock memblock_dump(struct memblock_type *type, char *name) { unsigned long long base, size; int i; - pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + pr_info(" %s.cnt = 0x%lx\n", name, type->cnt); - for (i = 0; i < region->cnt; i++) { - base = region->regions[i].base; - size = region->regions[i].size; - - pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", - name, i, base, base + size - 1, size); + for (i = 0; i < type->cnt; i++) { + struct memblock_region *rgn = &type->regions[i]; + char nid_buf[32] = ""; + + base = rgn->base; + size = rgn->size; +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + if (memblock_get_region_node(rgn) != MAX_NUMNODES) + snprintf(nid_buf, sizeof(nid_buf), " on node %d", + memblock_get_region_node(rgn)); +#endif + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes%s\n", + name, i, base, base + size - 1, size, nid_buf); } } -void __init_memblock memblock_dump_all(void) +void __init_memblock __memblock_dump_all(void) { - if (!memblock_debug) - return; - pr_info("MEMBLOCK configuration:\n"); - pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); + pr_info(" memory size = %#llx reserved size = %#llx\n", + (unsigned long long)memblock.memory.total_size, + (unsigned long long)memblock.reserved.total_size); memblock_dump(&memblock.memory, "memory"); memblock_dump(&memblock.reserved, "reserved"); } -void __init memblock_analyze(void) +void __init memblock_allow_resize(void) { - int i; - - /* Check marker in the unused last array entry */ - WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base - != MEMBLOCK_INACTIVE); - WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base - != MEMBLOCK_INACTIVE); - - memblock.memory_size = 0; - - for (i = 0; i < memblock.memory.cnt; i++) - memblock.memory_size += memblock.memory.regions[i].size; - - /* We allow resizing from there */ memblock_can_resize = 1; } -void __init memblock_init(void) -{ - static int init_done __initdata = 0; - - if (init_done) - return; - init_done = 1; - - /* Hookup the initial arrays */ - memblock.memory.regions = memblock_memory_init_regions; - memblock.memory.max = INIT_MEMBLOCK_REGIONS; - memblock.reserved.regions = memblock_reserved_init_regions; - memblock.reserved.max = INIT_MEMBLOCK_REGIONS; - - /* Write a marker in the unused last array entry */ - memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; - memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; - - /* Create a dummy zero size MEMBLOCK which will get coalesced away later. - * This simplifies the memblock_add() code below... - */ - memblock.memory.regions[0].base = 0; - memblock.memory.regions[0].size = 0; - memblock.memory.cnt = 1; - - /* Ditto. */ - memblock.reserved.regions[0].base = 0; - memblock.reserved.regions[0].size = 0; - memblock.reserved.cnt = 1; - - memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; -} - static int __init early_memblock(char *p) { if (p && strstr(p, "debug")) @@ -819,7 +921,7 @@ static int __init early_memblock(char *p) } early_param("memblock", early_memblock); -#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) +#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK) static int memblock_debug_show(struct seq_file *m, void *private) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7fa41b4a07b..24f0fc1a56d 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -41,14 +41,13 @@ static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, if (limit > memblock.current_limit) limit = memblock.current_limit; - addr = find_memory_core_early(nid, size, align, goal, limit); - - if (addr == MEMBLOCK_ERROR) + addr = memblock_find_in_range_node(goal, limit, size, align, nid); + if (!addr) return NULL; ptr = phys_to_virt(addr); memset(ptr, 0, size); - memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + memblock_reserve(addr, size); /* * The min_count is set to 0 so that bootmem allocated blocks * are never reported as leaks. @@ -107,23 +106,27 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } -unsigned long __init free_all_memory_core_early(int nodeid) +unsigned long __init free_low_memory_core_early(int nodeid) { - int i; - u64 start, end; unsigned long count = 0; - struct range *range = NULL; - int nr_range; - - nr_range = get_free_all_memory_range(&range, nodeid); - - for (i = 0; i < nr_range; i++) { - start = range[i].start; - end = range[i].end; - count += end - start; - __free_pages_memory(start, end); + phys_addr_t start, end; + u64 i; + + /* free reserved array temporarily so that it's treated as free area */ + memblock_free_reserved_regions(); + + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + if (start_pfn < end_pfn) { + __free_pages_memory(start_pfn, end_pfn); + count += end_pfn - start_pfn; + } } + /* put region array back? */ + memblock_reserve_reserved_regions(); return count; } @@ -137,7 +140,7 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); - /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ return 0; } @@ -155,7 +158,7 @@ unsigned long __init free_all_bootmem(void) * Use MAX_NUMNODES will make sure all ranges in early_node_map[] * will be used instead of only Node0 related */ - return free_all_memory_core_early(MAX_NUMNODES); + return free_low_memory_core_early(MAX_NUMNODES); } /** @@ -172,7 +175,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { kmemleak_free_part(__va(physaddr), size); - memblock_x86_free_range(physaddr, physaddr + size); + memblock_free(physaddr, size); } /** @@ -187,7 +190,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, void __init free_bootmem(unsigned long addr, unsigned long size) { kmemleak_free_part(__va(addr), size); - memblock_x86_free_range(addr, addr + size); + memblock_free(addr, size); } static void * __init ___alloc_bootmem_nopanic(unsigned long size, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b8ba3aebf6..bdc804c2d99 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -181,39 +181,17 @@ static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP - /* - * MAX_ACTIVE_REGIONS determines the maximum number of distinct - * ranges of memory (RAM) that may be registered with add_active_range(). - * Ranges passed to add_active_range() will be merged if possible - * so the number of times add_active_range() can be called is - * related to the number of nodes and the number of holes - */ - #ifdef CONFIG_MAX_ACTIVE_REGIONS - /* Allow an architecture to set MAX_ACTIVE_REGIONS to save memory */ - #define MAX_ACTIVE_REGIONS CONFIG_MAX_ACTIVE_REGIONS - #else - #if MAX_NUMNODES >= 32 - /* If there can be many nodes, allow up to 50 holes per node */ - #define MAX_ACTIVE_REGIONS (MAX_NUMNODES*50) - #else - /* By default, allow up to 256 distinct regions */ - #define MAX_ACTIVE_REGIONS 256 - #endif - #endif - - static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; - static int __meminitdata nr_nodemap_entries; - static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; - static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; - static unsigned long __initdata required_kernelcore; - static unsigned long __initdata required_movablecore; - static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; - - /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ - int movable_zone; - EXPORT_SYMBOL(movable_zone); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; +static unsigned long __initdata required_kernelcore; +static unsigned long __initdata required_movablecore; +static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; + +/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ +int movable_zone; +EXPORT_SYMBOL(movable_zone); +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; @@ -706,10 +684,10 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) int loop; prefetchw(page); - for (loop = 0; loop < BITS_PER_LONG; loop++) { + for (loop = 0; loop < (1 << order); loop++) { struct page *p = &page[loop]; - if (loop + 1 < BITS_PER_LONG) + if (loop + 1 < (1 << order)) prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); @@ -3737,35 +3715,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, return 0; } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP -/* - * Basic iterator support. Return the first range of PFNs for a node - * Note: nid == MAX_NUMNODES returns first region regardless of node - */ -static int __meminit first_active_region_index_in_nid(int nid) -{ - int i; - - for (i = 0; i < nr_nodemap_entries; i++) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the next active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit next_active_region_index_in_nid(int index, int nid) -{ - for (index = index + 1; index < nr_nodemap_entries; index++) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. @@ -3775,15 +3725,12 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) */ int __meminit __early_pfn_to_nid(unsigned long pfn) { - int i; - - for (i = 0; i < nr_nodemap_entries; i++) { - unsigned long start_pfn = early_node_map[i].start_pfn; - unsigned long end_pfn = early_node_map[i].end_pfn; + unsigned long start_pfn, end_pfn; + int i, nid; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) if (start_pfn <= pfn && pfn < end_pfn) - return early_node_map[i].nid; - } + return nid; /* This is a memory hole */ return -1; } @@ -3812,11 +3759,6 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) } #endif -/* Basic iterator support to walk early_node_map[] */ -#define for_each_active_range_index_in_nid(i, nid) \ - for (i = first_active_region_index_in_nid(nid); i != -1; \ - i = next_active_region_index_in_nid(i, nid)) - /** * free_bootmem_with_active_regions - Call free_bootmem_node for each active range * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. @@ -3826,122 +3768,34 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * add_active_ranges() contain no holes and may be freed, this * this function may be used instead of calling free_bootmem() manually. */ -void __init free_bootmem_with_active_regions(int nid, - unsigned long max_low_pfn) -{ - int i; - - for_each_active_range_index_in_nid(i, nid) { - unsigned long size_pages = 0; - unsigned long end_pfn = early_node_map[i].end_pfn; - - if (early_node_map[i].start_pfn >= max_low_pfn) - continue; - - if (end_pfn > max_low_pfn) - end_pfn = max_low_pfn; - - size_pages = end_pfn - early_node_map[i].start_pfn; - free_bootmem_node(NODE_DATA(early_node_map[i].nid), - PFN_PHYS(early_node_map[i].start_pfn), - size_pages << PAGE_SHIFT); - } -} - -#ifdef CONFIG_HAVE_MEMBLOCK -/* - * Basic iterator support. Return the last range of PFNs for a node - * Note: nid == MAX_NUMNODES returns last region regardless of node - */ -static int __meminit last_active_region_index_in_nid(int nid) +void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { - int i; - - for (i = nr_nodemap_entries - 1; i >= 0; i--) - if (nid == MAX_NUMNODES || early_node_map[i].nid == nid) - return i; - - return -1; -} - -/* - * Basic iterator support. Return the previous active range of PFNs for a node - * Note: nid == MAX_NUMNODES returns next region regardless of node - */ -static int __meminit previous_active_region_index_in_nid(int index, int nid) -{ - for (index = index - 1; index >= 0; index--) - if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) - return index; - - return -1; -} - -#define for_each_active_range_index_in_nid_reverse(i, nid) \ - for (i = last_active_region_index_in_nid(nid); i != -1; \ - i = previous_active_region_index_in_nid(i, nid)) - -u64 __init find_memory_core_early(int nid, u64 size, u64 align, - u64 goal, u64 limit) -{ - int i; - - /* Need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid_reverse(i, nid) { - u64 addr; - u64 ei_start, ei_last; - u64 final_start, final_end; - - ei_last = early_node_map[i].end_pfn; - ei_last <<= PAGE_SHIFT; - ei_start = early_node_map[i].start_pfn; - ei_start <<= PAGE_SHIFT; - - final_start = max(ei_start, goal); - final_end = min(ei_last, limit); - - if (final_start >= final_end) - continue; - - addr = memblock_find_in_range(final_start, final_end, size, align); + unsigned long start_pfn, end_pfn; + int i, this_nid; - if (addr == MEMBLOCK_ERROR) - continue; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) { + start_pfn = min(start_pfn, max_low_pfn); + end_pfn = min(end_pfn, max_low_pfn); - return addr; + if (start_pfn < end_pfn) + free_bootmem_node(NODE_DATA(this_nid), + PFN_PHYS(start_pfn), + (end_pfn - start_pfn) << PAGE_SHIFT); } - - return MEMBLOCK_ERROR; } -#endif int __init add_from_early_node_map(struct range *range, int az, int nr_range, int nid) { + unsigned long start_pfn, end_pfn; int i; - u64 start, end; /* need to go over early_node_map to find out good range for node */ - for_each_active_range_index_in_nid(i, nid) { - start = early_node_map[i].start_pfn; - end = early_node_map[i].end_pfn; - nr_range = add_range(range, az, nr_range, start, end); - } + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) + nr_range = add_range(range, az, nr_range, start_pfn, end_pfn); return nr_range; } -void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) -{ - int i; - int ret; - - for_each_active_range_index_in_nid(i, nid) { - ret = work_fn(early_node_map[i].start_pfn, - early_node_map[i].end_pfn, data); - if (ret) - break; - } -} /** * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. @@ -3952,12 +3806,11 @@ void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) */ void __init sparse_memory_present_with_active_regions(int nid) { - int i; + unsigned long start_pfn, end_pfn; + int i, this_nid; - for_each_active_range_index_in_nid(i, nid) - memory_present(early_node_map[i].nid, - early_node_map[i].start_pfn, - early_node_map[i].end_pfn); + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) + memory_present(this_nid, start_pfn, end_pfn); } /** @@ -3974,13 +3827,15 @@ void __init sparse_memory_present_with_active_regions(int nid) void __meminit get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) { + unsigned long this_start_pfn, this_end_pfn; int i; + *start_pfn = -1UL; *end_pfn = 0; - for_each_active_range_index_in_nid(i, nid) { - *start_pfn = min(*start_pfn, early_node_map[i].start_pfn); - *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); + for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) { + *start_pfn = min(*start_pfn, this_start_pfn); + *end_pfn = max(*end_pfn, this_end_pfn); } if (*start_pfn == -1UL) @@ -4083,46 +3938,16 @@ unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { - int i = 0; - unsigned long prev_end_pfn = 0, hole_pages = 0; - unsigned long start_pfn; - - /* Find the end_pfn of the first active range of pfns in the node */ - i = first_active_region_index_in_nid(nid); - if (i == -1) - return 0; - - prev_end_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - - /* Account for ranges before physical memory on this node */ - if (early_node_map[i].start_pfn > range_start_pfn) - hole_pages = prev_end_pfn - range_start_pfn; - - /* Find all holes for the zone within the node */ - for (; i != -1; i = next_active_region_index_in_nid(i, nid)) { - - /* No need to continue if prev_end_pfn is outside the zone */ - if (prev_end_pfn >= range_end_pfn) - break; - - /* Make sure the end of the zone is not within the hole */ - start_pfn = min(early_node_map[i].start_pfn, range_end_pfn); - prev_end_pfn = max(prev_end_pfn, range_start_pfn); + unsigned long nr_absent = range_end_pfn - range_start_pfn; + unsigned long start_pfn, end_pfn; + int i; - /* Update the hole size cound and move on */ - if (start_pfn > range_start_pfn) { - BUG_ON(prev_end_pfn > start_pfn); - hole_pages += start_pfn - prev_end_pfn; - } - prev_end_pfn = early_node_map[i].end_pfn; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { + start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn); + end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn); + nr_absent -= end_pfn - start_pfn; } - - /* Account for ranges past physical memory on this node */ - if (range_end_pfn > prev_end_pfn) - hole_pages += range_end_pfn - - max(range_start_pfn, prev_end_pfn); - - return hole_pages; + return nr_absent; } /** @@ -4143,14 +3968,14 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, unsigned long *ignored) { + unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; + unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); - zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type], - node_start_pfn); - zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type], - node_end_pfn); + zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); + zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); adjust_zone_range_for_zone_movable(nid, zone_type, node_start_pfn, node_end_pfn, @@ -4158,7 +3983,7 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn); } -#else +#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, unsigned long *zones_size) @@ -4176,7 +4001,7 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, return zholes_size[zone_type]; } -#endif +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) @@ -4399,10 +4224,10 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ @@ -4427,7 +4252,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifdef CONFIG_ARCH_POPULATES_NODE_MAP +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 /* @@ -4449,170 +4274,6 @@ static inline void setup_nr_node_ids(void) #endif /** - * add_active_range - Register a range of PFNs backed by physical memory - * @nid: The node ID the range resides on - * @start_pfn: The start PFN of the available physical memory - * @end_pfn: The end PFN of the available physical memory - * - * These ranges are stored in an early_node_map[] and later used by - * free_area_init_nodes() to calculate zone sizes and holes. If the - * range spans a memory hole, it is up to the architecture to ensure - * the memory is not freed by the bootmem allocator. If possible - * the range being registered will be merged with existing ranges. - */ -void __init add_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - int i; - - mminit_dprintk(MMINIT_TRACE, "memory_register", - "Entering add_active_range(%d, %#lx, %#lx) " - "%d entries of %d used\n", - nid, start_pfn, end_pfn, - nr_nodemap_entries, MAX_ACTIVE_REGIONS); - - mminit_validate_memmodel_limits(&start_pfn, &end_pfn); - - /* Merge with existing active regions if possible */ - for (i = 0; i < nr_nodemap_entries; i++) { - if (early_node_map[i].nid != nid) - continue; - - /* Skip if an existing region covers this new one */ - if (start_pfn >= early_node_map[i].start_pfn && - end_pfn <= early_node_map[i].end_pfn) - return; - - /* Merge forward if suitable */ - if (start_pfn <= early_node_map[i].end_pfn && - end_pfn > early_node_map[i].end_pfn) { - early_node_map[i].end_pfn = end_pfn; - return; - } - - /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].start_pfn && - end_pfn >= early_node_map[i].start_pfn) { - early_node_map[i].start_pfn = start_pfn; - return; - } - } - - /* Check that early_node_map is large enough */ - if (i >= MAX_ACTIVE_REGIONS) { - printk(KERN_CRIT "More than %d memory regions, truncating\n", - MAX_ACTIVE_REGIONS); - return; - } - - early_node_map[i].nid = nid; - early_node_map[i].start_pfn = start_pfn; - early_node_map[i].end_pfn = end_pfn; - nr_nodemap_entries = i + 1; -} - -/** - * remove_active_range - Shrink an existing registered range of PFNs - * @nid: The node id the range is on that should be shrunk - * @start_pfn: The new PFN of the range - * @end_pfn: The new PFN of the range - * - * i386 with NUMA use alloc_remap() to store a node_mem_map on a local node. - * The map is kept near the end physical page range that has already been - * registered. This function allows an arch to shrink an existing registered - * range. - */ -void __init remove_active_range(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - int i, j; - int removed = 0; - - printk(KERN_DEBUG "remove_active_range (%d, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Find the old active region end and shrink */ - for_each_active_range_index_in_nid(i, nid) { - if (early_node_map[i].start_pfn >= start_pfn && - early_node_map[i].end_pfn <= end_pfn) { - /* clear it */ - early_node_map[i].start_pfn = 0; - early_node_map[i].end_pfn = 0; - removed = 1; - continue; - } - if (early_node_map[i].start_pfn < start_pfn && - early_node_map[i].end_pfn > start_pfn) { - unsigned long temp_end_pfn = early_node_map[i].end_pfn; - early_node_map[i].end_pfn = start_pfn; - if (temp_end_pfn > end_pfn) - add_active_range(nid, end_pfn, temp_end_pfn); - continue; - } - if (early_node_map[i].start_pfn >= start_pfn && - early_node_map[i].end_pfn > end_pfn && - early_node_map[i].start_pfn < end_pfn) { - early_node_map[i].start_pfn = end_pfn; - continue; - } - } - - if (!removed) - return; - - /* remove the blank ones */ - for (i = nr_nodemap_entries - 1; i > 0; i--) { - if (early_node_map[i].nid != nid) - continue; - if (early_node_map[i].end_pfn) - continue; - /* we found it, get rid of it */ - for (j = i; j < nr_nodemap_entries - 1; j++) - memcpy(&early_node_map[j], &early_node_map[j+1], - sizeof(early_node_map[j])); - j = nr_nodemap_entries - 1; - memset(&early_node_map[j], 0, sizeof(early_node_map[j])); - nr_nodemap_entries--; - } -} - -/** - * remove_all_active_ranges - Remove all currently registered regions - * - * During discovery, it may be found that a table like SRAT is invalid - * and an alternative discovery method must be used. This function removes - * all currently registered regions. - */ -void __init remove_all_active_ranges(void) -{ - memset(early_node_map, 0, sizeof(early_node_map)); - nr_nodemap_entries = 0; -} - -/* Compare two active node_active_regions */ -static int __init cmp_node_active_region(const void *a, const void *b) -{ - struct node_active_region *arange = (struct node_active_region *)a; - struct node_active_region *brange = (struct node_active_region *)b; - - /* Done this way to avoid overflows */ - if (arange->start_pfn > brange->start_pfn) - return 1; - if (arange->start_pfn < brange->start_pfn) - return -1; - - return 0; -} - -/* sort the node_map by start_pfn */ -void __init sort_node_map(void) -{ - sort(early_node_map, (size_t)nr_nodemap_entries, - sizeof(struct node_active_region), - cmp_node_active_region, NULL); -} - -/** * node_map_pfn_alignment - determine the maximum internode alignment * * This function should be called after node map is populated and sorted. @@ -4634,15 +4295,11 @@ void __init sort_node_map(void) unsigned long __init node_map_pfn_alignment(void) { unsigned long accl_mask = 0, last_end = 0; + unsigned long start, end, mask; int last_nid = -1; - int i; - - for_each_active_range_index_in_nid(i, MAX_NUMNODES) { - int nid = early_node_map[i].nid; - unsigned long start = early_node_map[i].start_pfn; - unsigned long end = early_node_map[i].end_pfn; - unsigned long mask; + int i, nid; + for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) { if (!start || last_nid < 0 || last_nid == nid) { last_nid = nid; last_end = end; @@ -4669,12 +4326,12 @@ unsigned long __init node_map_pfn_alignment(void) /* Find the lowest pfn for a node */ static unsigned long __init find_min_pfn_for_node(int nid) { - int i; unsigned long min_pfn = ULONG_MAX; + unsigned long start_pfn; + int i; - /* Assuming a sorted map, the first range found has the starting pfn */ - for_each_active_range_index_in_nid(i, nid) - min_pfn = min(min_pfn, early_node_map[i].start_pfn); + for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL) + min_pfn = min(min_pfn, start_pfn); if (min_pfn == ULONG_MAX) { printk(KERN_WARNING @@ -4703,15 +4360,16 @@ unsigned long __init find_min_pfn_with_active_regions(void) */ static unsigned long __init early_calculate_totalpages(void) { - int i; unsigned long totalpages = 0; + unsigned long start_pfn, end_pfn; + int i, nid; + + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { + unsigned long pages = end_pfn - start_pfn; - for (i = 0; i < nr_nodemap_entries; i++) { - unsigned long pages = early_node_map[i].end_pfn - - early_node_map[i].start_pfn; totalpages += pages; if (pages) - node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); + node_set_state(nid, N_HIGH_MEMORY); } return totalpages; } @@ -4766,6 +4424,8 @@ restart: /* Spread kernelcore memory as evenly as possible throughout nodes */ kernelcore_node = required_kernelcore / usable_nodes; for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long start_pfn, end_pfn; + /* * Recalculate kernelcore_node if the division per node * now exceeds what is necessary to satisfy the requested @@ -4782,13 +4442,10 @@ restart: kernelcore_remaining = kernelcore_node; /* Go through each range of PFNs within this node */ - for_each_active_range_index_in_nid(i, nid) { - unsigned long start_pfn, end_pfn; + for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) { unsigned long size_pages; - start_pfn = max(early_node_map[i].start_pfn, - zone_movable_pfn[nid]); - end_pfn = early_node_map[i].end_pfn; + start_pfn = max(start_pfn, zone_movable_pfn[nid]); if (start_pfn >= end_pfn) continue; @@ -4890,11 +4547,8 @@ static void check_for_regular_memory(pg_data_t *pgdat) */ void __init free_area_init_nodes(unsigned long *max_zone_pfn) { - unsigned long nid; - int i; - - /* Sort early_node_map as initialisation assumes it is sorted */ - sort_node_map(); + unsigned long start_pfn, end_pfn; + int i, nid; /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, @@ -4941,11 +4595,9 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) } /* Print out the early_node_map[] */ - printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); - for (i = 0; i < nr_nodemap_entries; i++) - printk(" %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid, - early_node_map[i].start_pfn, - early_node_map[i].end_pfn); + printk("Early memory PFN ranges\n"); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); /* Initialise every node */ mminit_verify_pageflags_layout(); @@ -4998,7 +4650,7 @@ static int __init cmdline_parse_movablecore(char *p) early_param("kernelcore", cmdline_parse_kernelcore); early_param("movablecore", cmdline_parse_movablecore); -#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ +#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ /** * set_dma_reserve - set the specified number of pages reserved in the first zone diff --git a/mm/slub.c b/mm/slub.c index ed3334d9b6d..09ccee8fb58 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -368,7 +368,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page VM_BUG_ON(!irqs_disabled()); #ifdef CONFIG_CMPXCHG_DOUBLE if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, + if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) return 1; @@ -402,7 +402,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, { #ifdef CONFIG_CMPXCHG_DOUBLE if (s->flags & __CMPXCHG_DOUBLE) { - if (cmpxchg_double(&page->freelist, + if (cmpxchg_double(&page->freelist, &page->counters, freelist_old, counters_old, freelist_new, counters_new)) return 1; diff --git a/net/socket.c b/net/socket.c index 2877647f347..a0053750e37 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2883,7 +2883,7 @@ static int bond_ioctl(struct net *net, unsigned int cmd, return dev_ioctl(net, cmd, uifr); default: - return -EINVAL; + return -ENOIOCTLCMD; } } @@ -3210,20 +3210,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, return sock_do_ioctl(net, sock, cmd, arg); } - /* Prevent warning from compat_sys_ioctl, these always - * result in -EINVAL in the native case anyway. */ - switch (cmd) { - case SIOCRTMSG: - case SIOCGIFCOUNT: - case SIOCSRARP: - case SIOCGRARP: - case SIOCDRARP: - case SIOCSIFLINK: - case SIOCGIFSLAVE: - case SIOCSIFSLAVE: - return -EINVAL; - } - return -ENOIOCTLCMD; } diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index fe6762ed56b..c89f9e1453f 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -22,7 +22,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data) + Input file name. (default: perf.data unless stdin is a fifo) -d:: --dsos=<dso[,dso...]>:: @@ -66,7 +66,7 @@ OPTIONS used. This interfaces starts by centering on the line with more samples, TAB/UNTAB cycles through the lines with more samples. --c:: +-C:: --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can be provided as a comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to report samples on all diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt index cc22325ffd1..25c52efcc7f 100644 --- a/tools/perf/Documentation/perf-buildid-list.txt +++ b/tools/perf/Documentation/perf-buildid-list.txt @@ -26,7 +26,7 @@ OPTIONS Show only DSOs with hits. -i:: --input=:: - Input file name. (default: perf.data) + Input file name. (default: perf.data unless stdin is a fifo) -f:: --force:: Don't do ownership validation. diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt index 0cada9e053d..0507ec7bad7 100644 --- a/tools/perf/Documentation/perf-evlist.txt +++ b/tools/perf/Documentation/perf-evlist.txt @@ -18,7 +18,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data) + Input file name. (default: perf.data unless stdin is a fifo) SEE ALSO -------- diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt index a52fcde894c..7c8fbbf3f61 100644 --- a/tools/perf/Documentation/perf-kmem.txt +++ b/tools/perf/Documentation/perf-kmem.txt @@ -23,7 +23,7 @@ OPTIONS ------- -i <file>:: --input=<file>:: - Select the input file (default: perf.data) + Select the input file (default: perf.data unless stdin is a fifo) --caller:: Show per-callsite statistics diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt index 4a26a2f3a6a..d6b2a4f2108 100644 --- a/tools/perf/Documentation/perf-lock.txt +++ b/tools/perf/Documentation/perf-lock.txt @@ -29,7 +29,7 @@ COMMON OPTIONS -i:: --input=<file>:: - Input file name. + Input file name. (default: perf.data unless stdin is a fifo) -v:: --verbose:: diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt index 5a520f82529..2937f7e14bb 100644 --- a/tools/perf/Documentation/perf-record.txt +++ b/tools/perf/Documentation/perf-record.txt @@ -89,7 +89,7 @@ OPTIONS -m:: --mmap-pages=:: - Number of mmap data pages. + Number of mmap data pages. Must be a power of two. -g:: --call-graph:: diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index 212f24d672e..9b430e98712 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -19,7 +19,7 @@ OPTIONS ------- -i:: --input=:: - Input file name. (default: perf.data) + Input file name. (default: perf.data unless stdin is a fifo) -v:: --verbose:: @@ -39,7 +39,7 @@ OPTIONS -T:: --threads:: Show per-thread event counters --C:: +-c:: --comms=:: Only consider symbols in these comms. CSV that understands file://filename entries. @@ -80,9 +80,10 @@ OPTIONS --dump-raw-trace:: Dump raw trace in ASCII. --g [type,min,order]:: +-g [type,min[,limit],order]:: --call-graph:: - Display call chains using type, min percent threshold and order. + Display call chains using type, min percent threshold, optional print + limit and order. type can be either: - flat: single column, linear exposure of call chains. - graph: use a graph tree, displaying absolute overhead rates. @@ -128,7 +129,7 @@ OPTIONS --symfs=<directory>:: Look for files with symbols relative to this directory. --c:: +-C:: --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can be provided as a comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to report samples on all diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt index 5b212b57f70..8ff4df95695 100644 --- a/tools/perf/Documentation/perf-sched.txt +++ b/tools/perf/Documentation/perf-sched.txt @@ -40,7 +40,7 @@ OPTIONS ------- -i:: --input=<file>:: - Input file name. (default: perf.data) + Input file name. (default: perf.data unless stdin is a fifo) -v:: --verbose:: diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt index dec87ecb530..2f6cef43da2 100644 --- a/tools/perf/Documentation/perf-script.txt +++ b/tools/perf/Documentation/perf-script.txt @@ -106,7 +106,7 @@ OPTIONS -i:: --input=:: - Input file name. + Input file name. (default: perf.data unless stdin is a fifo) -d:: --debug-mode:: @@ -182,12 +182,17 @@ OPTIONS --hide-call-graph:: When printing symbols do not display call chain. --c:: +-C:: --cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can be provided as a comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2. Default is to report samples on all CPUs. +-c:: +--comms=:: + Only display events for these comms. CSV that understands + file://filename entries. + -I:: --show-info:: Display extended information about the perf.data file. This adds diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt index 2c3b462f64b..b24ac40fcd5 100644 --- a/tools/perf/Documentation/perf-test.txt +++ b/tools/perf/Documentation/perf-test.txt @@ -8,13 +8,19 @@ perf-test - Runs sanity tests. SYNOPSIS -------- [verse] -'perf test <options>' +'perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]' DESCRIPTION ----------- This command does assorted sanity tests, initially through linked routines but also will look for a directory with more tests in the form of scripts. +To get a list of available tests use 'perf test list', specifying a test name +fragment will show all tests that have it. + +To run just specific tests, inform test name fragments or the numbers obtained +from 'perf test list'. + OPTIONS ------- -v:: diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt index d7b79e2ba2a..1632b0efc75 100644 --- a/tools/perf/Documentation/perf-timechart.txt +++ b/tools/perf/Documentation/perf-timechart.txt @@ -27,7 +27,7 @@ OPTIONS Select the output file (default: output.svg) -i:: --input=:: - Select the input file (default: perf.data) + Select the input file (default: perf.data unless stdin is a fifo) -w:: --width=:: Select the width of the SVG file (default: 1000) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index b98e3075646..ac86d67b636 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -278,6 +278,7 @@ LIB_H += util/strbuf.h LIB_H += util/strlist.h LIB_H += util/strfilter.h LIB_H += util/svghelper.h +LIB_H += util/tool.h LIB_H += util/run-command.h LIB_H += util/sigchain.h LIB_H += util/symbol.h diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 46b4c24f338..214ba7f9f57 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -27,32 +27,32 @@ #include "util/sort.h" #include "util/hist.h" #include "util/session.h" +#include "util/tool.h" #include <linux/bitmap.h> -static char const *input_name = "perf.data"; - -static bool force, use_tui, use_stdio; - -static bool full_paths; - -static bool print_line; - -static const char *sym_hist_filter; - -static const char *cpu_list; -static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); +struct perf_annotate { + struct perf_tool tool; + char const *input_name; + bool force, use_tui, use_stdio; + bool full_paths; + bool print_line; + const char *sym_hist_filter; + const char *cpu_list; + DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); +}; -static int perf_evlist__add_sample(struct perf_evlist *evlist, - struct perf_sample *sample, - struct perf_evsel *evsel, - struct addr_location *al) +static int perf_evsel__add_sample(struct perf_evsel *evsel, + struct perf_sample *sample, + struct addr_location *al, + struct perf_annotate *ann) { struct hist_entry *he; int ret; - if (sym_hist_filter != NULL && - (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) { + if (ann->sym_hist_filter != NULL && + (al->sym == NULL || + strcmp(ann->sym_hist_filter, al->sym->name) != 0)) { /* We're only interested in a symbol named sym_hist_filter */ if (al->sym != NULL) { rb_erase(&al->sym->rb_node, @@ -69,8 +69,7 @@ static int perf_evlist__add_sample(struct perf_evlist *evlist, ret = 0; if (he->ms.sym != NULL) { struct annotation *notes = symbol__annotation(he->ms.sym); - if (notes->src == NULL && - symbol__alloc_hist(he->ms.sym, evlist->nr_entries) < 0) + if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0) return -ENOMEM; ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); @@ -81,25 +80,26 @@ static int perf_evlist__add_sample(struct perf_evlist *evlist, return ret; } -static int process_sample_event(union perf_event *event, +static int process_sample_event(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, - struct perf_session *session) + struct machine *machine) { + struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool); struct addr_location al; - if (perf_event__preprocess_sample(event, session, &al, sample, + if (perf_event__preprocess_sample(event, machine, &al, sample, symbol__annotate_init) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; } - if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) + if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap)) return 0; - if (!al.filtered && - perf_evlist__add_sample(session->evlist, sample, evsel, &al)) { + if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) { pr_warning("problem incrementing symbol count, " "skipping event\n"); return -1; @@ -108,14 +108,15 @@ static int process_sample_event(union perf_event *event, return 0; } -static int hist_entry__tty_annotate(struct hist_entry *he, int evidx) +static int hist_entry__tty_annotate(struct hist_entry *he, int evidx, + struct perf_annotate *ann) { return symbol__tty_annotate(he->ms.sym, he->ms.map, evidx, - print_line, full_paths, 0, 0); + ann->print_line, ann->full_paths, 0, 0); } static void hists__find_annotations(struct hists *self, int evidx, - int nr_events) + struct perf_annotate *ann) { struct rb_node *nd = rb_first(&self->entries), *next; int key = K_RIGHT; @@ -138,8 +139,7 @@ find_next: } if (use_browser > 0) { - key = hist_entry__tui_annotate(he, evidx, nr_events, - NULL, NULL, 0); + key = hist_entry__tui_annotate(he, evidx, NULL, NULL, 0); switch (key) { case K_RIGHT: next = rb_next(nd); @@ -154,7 +154,7 @@ find_next: if (next != NULL) nd = next; } else { - hist_entry__tty_annotate(he, evidx); + hist_entry__tty_annotate(he, evidx, ann); nd = rb_next(nd); /* * Since we have a hist_entry per IP for the same @@ -167,33 +167,26 @@ find_next: } } -static struct perf_event_ops event_ops = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .comm = perf_event__process_comm, - .fork = perf_event__process_task, - .ordered_samples = true, - .ordering_requires_timestamps = true, -}; - -static int __cmd_annotate(void) +static int __cmd_annotate(struct perf_annotate *ann) { int ret; struct perf_session *session; struct perf_evsel *pos; u64 total_nr_samples; - session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); + session = perf_session__new(ann->input_name, O_RDONLY, + ann->force, false, &ann->tool); if (session == NULL) return -ENOMEM; - if (cpu_list) { - ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); + if (ann->cpu_list) { + ret = perf_session__cpu_bitmap(session, ann->cpu_list, + ann->cpu_bitmap); if (ret) goto out_delete; } - ret = perf_session__process_events(session, &event_ops); + ret = perf_session__process_events(session, &ann->tool); if (ret) goto out_delete; @@ -217,13 +210,12 @@ static int __cmd_annotate(void) total_nr_samples += nr_samples; hists__collapse_resort(hists); hists__output_resort(hists); - hists__find_annotations(hists, pos->idx, - session->evlist->nr_entries); + hists__find_annotations(hists, pos->idx, ann); } } if (total_nr_samples == 0) { - ui__warning("The %s file has no samples!\n", input_name); + ui__warning("The %s file has no samples!\n", session->filename); goto out_delete; } out_delete: @@ -247,29 +239,41 @@ static const char * const annotate_usage[] = { NULL }; -static const struct option options[] = { - OPT_STRING('i', "input", &input_name, "file", +int cmd_annotate(int argc, const char **argv, const char *prefix __used) +{ + struct perf_annotate annotate = { + .tool = { + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .fork = perf_event__process_task, + .ordered_samples = true, + .ordering_requires_timestamps = true, + }, + }; + const struct option options[] = { + OPT_STRING('i', "input", &annotate.input_name, "file", "input file name"), OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", "only consider symbols in these dsos"), - OPT_STRING('s', "symbol", &sym_hist_filter, "symbol", + OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol", "symbol to annotate"), - OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"), - OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), - OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), + OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"), OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name, "file", "vmlinux pathname"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), - OPT_BOOLEAN('l', "print-line", &print_line, + OPT_BOOLEAN('l', "print-line", &annotate.print_line, "print matching source lines (may be slow)"), - OPT_BOOLEAN('P', "full-paths", &full_paths, + OPT_BOOLEAN('P', "full-paths", &annotate.full_paths, "Don't shorten the displayed pathnames"), - OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src, @@ -279,15 +283,13 @@ static const struct option options[] = { OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. -M intel for intel syntax)"), OPT_END() -}; + }; -int cmd_annotate(int argc, const char **argv, const char *prefix __used) -{ argc = parse_options(argc, argv, options, annotate_usage, 0); - if (use_stdio) + if (annotate.use_stdio) use_browser = 0; - else if (use_tui) + else if (annotate.use_tui) use_browser = 1; setup_browser(true); @@ -308,7 +310,7 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) if (argc > 1) usage_with_options(annotate_usage, options); - sym_hist_filter = argv[0]; + annotate.sym_hist_filter = argv[0]; } if (field_sep && *field_sep == '.') { @@ -316,5 +318,5 @@ int cmd_annotate(int argc, const char **argv, const char *prefix __used) return -1; } - return __cmd_annotate(); + return __cmd_annotate(&annotate); } diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index cb690a65bf0..52480467e9f 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -18,7 +18,7 @@ #include <libelf.h> -static char const *input_name = "perf.data"; +static const char *input_name; static bool force; static bool show_kernel; static bool with_hits; @@ -39,24 +39,6 @@ static const struct option options[] = { OPT_END() }; -static int perf_session__list_build_ids(void) -{ - struct perf_session *session; - - session = perf_session__new(input_name, O_RDONLY, force, false, - &build_id__mark_dso_hit_ops); - if (session == NULL) - return -1; - - if (with_hits) - perf_session__process_events(session, &build_id__mark_dso_hit_ops); - - perf_session__fprintf_dsos_buildid(session, stdout, with_hits); - - perf_session__delete(session); - return 0; -} - static int sysfs__fprintf_build_id(FILE *fp) { u8 kallsyms_build_id[BUILD_ID_SIZE]; @@ -85,17 +67,36 @@ static int filename__fprintf_build_id(const char *name, FILE *fp) return fprintf(fp, "%s\n", sbuild_id); } -static int __cmd_buildid_list(void) +static int perf_session__list_build_ids(void) { - if (show_kernel) - return sysfs__fprintf_build_id(stdout); + struct perf_session *session; elf_version(EV_CURRENT); + + session = perf_session__new(input_name, O_RDONLY, force, false, + &build_id__mark_dso_hit_ops); + if (session == NULL) + return -1; + /* - * See if this is an ELF file first: - */ - if (filename__fprintf_build_id(input_name, stdout)) - return 0; + * See if this is an ELF file first: + */ + if (filename__fprintf_build_id(session->filename, stdout)) + goto out; + + if (with_hits) + perf_session__process_events(session, &build_id__mark_dso_hit_ops); + + perf_session__fprintf_dsos_buildid(session, stdout, with_hits); +out: + perf_session__delete(session); + return 0; +} + +static int __cmd_buildid_list(void) +{ + if (show_kernel) + return sysfs__fprintf_build_id(stdout); return perf_session__list_build_ids(); } diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c index b39f3a1ee7d..4f19513d7dd 100644 --- a/tools/perf/builtin-diff.c +++ b/tools/perf/builtin-diff.c @@ -9,7 +9,9 @@ #include "util/debug.h" #include "util/event.h" #include "util/hist.h" +#include "util/evsel.h" #include "util/session.h" +#include "util/tool.h" #include "util/sort.h" #include "util/symbol.h" #include "util/util.h" @@ -30,14 +32,15 @@ static int hists__add_entry(struct hists *self, return -ENOMEM; } -static int diff__process_sample_event(union perf_event *event, +static int diff__process_sample_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct perf_session *session) + struct machine *machine) { struct addr_location al; - if (perf_event__preprocess_sample(event, session, &al, sample, NULL) < 0) { + if (perf_event__preprocess_sample(event, machine, &al, sample, NULL) < 0) { pr_warning("problem processing %d event, skipping it.\n", event->header.type); return -1; @@ -46,16 +49,16 @@ static int diff__process_sample_event(union perf_event *event, if (al.filtered || al.sym == NULL) return 0; - if (hists__add_entry(&session->hists, &al, sample->period)) { + if (hists__add_entry(&evsel->hists, &al, sample->period)) { pr_warning("problem incrementing symbol period, skipping event\n"); return -1; } - session->hists.stats.total_period += sample->period; + evsel->hists.stats.total_period += sample->period; return 0; } -static struct perf_event_ops event_ops = { +static struct perf_tool perf_diff = { .sample = diff__process_sample_event, .mmap = perf_event__process_mmap, .comm = perf_event__process_comm, @@ -145,13 +148,13 @@ static int __cmd_diff(void) int ret, i; struct perf_session *session[2]; - session[0] = perf_session__new(input_old, O_RDONLY, force, false, &event_ops); - session[1] = perf_session__new(input_new, O_RDONLY, force, false, &event_ops); + session[0] = perf_session__new(input_old, O_RDONLY, force, false, &perf_diff); + session[1] = perf_session__new(input_new, O_RDONLY, force, false, &perf_diff); if (session[0] == NULL || session[1] == NULL) return -ENOMEM; for (i = 0; i < 2; ++i) { - ret = perf_session__process_events(session[i], &event_ops); + ret = perf_session__process_events(session[i], &perf_diff); if (ret) goto out_delete; } diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c index 4c5e9e04a41..26760322c4f 100644 --- a/tools/perf/builtin-evlist.c +++ b/tools/perf/builtin-evlist.c @@ -15,7 +15,7 @@ #include "util/parse-options.h" #include "util/session.h" -static char const *input_name = "perf.data"; +static const char *input_name; static int __cmd_evlist(void) { diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c index 8dfc12bb119..09c106193e6 100644 --- a/tools/perf/builtin-inject.c +++ b/tools/perf/builtin-inject.c @@ -9,6 +9,7 @@ #include "perf.h" #include "util/session.h" +#include "util/tool.h" #include "util/debug.h" #include "util/parse-options.h" @@ -16,8 +17,9 @@ static char const *input_name = "-"; static bool inject_build_ids; -static int perf_event__repipe_synth(union perf_event *event, - struct perf_session *session __used) +static int perf_event__repipe_synth(struct perf_tool *tool __used, + union perf_event *event, + struct machine *machine __used) { uint32_t size; void *buf = event; @@ -36,41 +38,70 @@ static int perf_event__repipe_synth(union perf_event *event, return 0; } -static int perf_event__repipe(union perf_event *event, +static int perf_event__repipe_op2_synth(struct perf_tool *tool, + union perf_event *event, + struct perf_session *session __used) +{ + return perf_event__repipe_synth(tool, event, NULL); +} + +static int perf_event__repipe_event_type_synth(struct perf_tool *tool, + union perf_event *event) +{ + return perf_event__repipe_synth(tool, event, NULL); +} + +static int perf_event__repipe_tracing_data_synth(union perf_event *event, + struct perf_session *session __used) +{ + return perf_event__repipe_synth(NULL, event, NULL); +} + +static int perf_event__repipe_attr(union perf_event *event, + struct perf_evlist **pevlist __used) +{ + return perf_event__repipe_synth(NULL, event, NULL); +} + +static int perf_event__repipe(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct machine *machine) { - return perf_event__repipe_synth(event, session); + return perf_event__repipe_synth(tool, event, machine); } -static int perf_event__repipe_sample(union perf_event *event, +static int perf_event__repipe_sample(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample __used, struct perf_evsel *evsel __used, - struct perf_session *session) + struct machine *machine) { - return perf_event__repipe_synth(event, session); + return perf_event__repipe_synth(tool, event, machine); } -static int perf_event__repipe_mmap(union perf_event *event, +static int perf_event__repipe_mmap(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample, - struct perf_session *session) + struct machine *machine) { int err; - err = perf_event__process_mmap(event, sample, session); - perf_event__repipe(event, sample, session); + err = perf_event__process_mmap(tool, event, sample, machine); + perf_event__repipe(tool, event, sample, machine); return err; } -static int perf_event__repipe_task(union perf_event *event, +static int perf_event__repipe_task(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample, - struct perf_session *session) + struct machine *machine) { int err; - err = perf_event__process_task(event, sample, session); - perf_event__repipe(event, sample, session); + err = perf_event__process_task(tool, event, sample, machine); + perf_event__repipe(tool, event, sample, machine); return err; } @@ -80,7 +111,7 @@ static int perf_event__repipe_tracing_data(union perf_event *event, { int err; - perf_event__repipe_synth(event, session); + perf_event__repipe_synth(NULL, event, NULL); err = perf_event__process_tracing_data(event, session); return err; @@ -100,10 +131,10 @@ static int dso__read_build_id(struct dso *self) return -1; } -static int dso__inject_build_id(struct dso *self, struct perf_session *session) +static int dso__inject_build_id(struct dso *self, struct perf_tool *tool, + struct machine *machine) { u16 misc = PERF_RECORD_MISC_USER; - struct machine *machine; int err; if (dso__read_build_id(self) < 0) { @@ -111,17 +142,11 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session) return -1; } - machine = perf_session__find_host_machine(session); - if (machine == NULL) { - pr_err("Can't find machine for session\n"); - return -1; - } - if (self->kernel) misc = PERF_RECORD_MISC_KERNEL; - err = perf_event__synthesize_build_id(self, misc, perf_event__repipe, - machine, session); + err = perf_event__synthesize_build_id(tool, self, misc, perf_event__repipe, + machine); if (err) { pr_err("Can't synthesize build_id event for %s\n", self->long_name); return -1; @@ -130,10 +155,11 @@ static int dso__inject_build_id(struct dso *self, struct perf_session *session) return 0; } -static int perf_event__inject_buildid(union perf_event *event, +static int perf_event__inject_buildid(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct perf_session *session) + struct machine *machine) { struct addr_location al; struct thread *thread; @@ -141,21 +167,21 @@ static int perf_event__inject_buildid(union perf_event *event, cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - thread = perf_session__findnew(session, event->ip.pid); + thread = machine__findnew_thread(machine, event->ip.pid); if (thread == NULL) { pr_err("problem processing %d event, skipping it.\n", event->header.type); goto repipe; } - thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, - event->ip.pid, event->ip.ip, &al); + thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION, + event->ip.ip, &al); if (al.map != NULL) { if (!al.map->dso->hit) { al.map->dso->hit = 1; if (map__load(al.map, NULL) >= 0) { - dso__inject_build_id(al.map->dso, session); + dso__inject_build_id(al.map->dso, tool, machine); /* * If this fails, too bad, let the other side * account this as unresolved. @@ -168,24 +194,24 @@ static int perf_event__inject_buildid(union perf_event *event, } repipe: - perf_event__repipe(event, sample, session); + perf_event__repipe(tool, event, sample, machine); return 0; } -struct perf_event_ops inject_ops = { +struct perf_tool perf_inject = { .sample = perf_event__repipe_sample, .mmap = perf_event__repipe, .comm = perf_event__repipe, .fork = perf_event__repipe, .exit = perf_event__repipe, .lost = perf_event__repipe, - .read = perf_event__repipe, + .read = perf_event__repipe_sample, .throttle = perf_event__repipe, .unthrottle = perf_event__repipe, - .attr = perf_event__repipe_synth, - .event_type = perf_event__repipe_synth, - .tracing_data = perf_event__repipe_synth, - .build_id = perf_event__repipe_synth, + .attr = perf_event__repipe_attr, + .event_type = perf_event__repipe_event_type_synth, + .tracing_data = perf_event__repipe_tracing_data_synth, + .build_id = perf_event__repipe_op2_synth, }; extern volatile int session_done; @@ -203,17 +229,17 @@ static int __cmd_inject(void) signal(SIGINT, sig_handler); if (inject_build_ids) { - inject_ops.sample = perf_event__inject_buildid; - inject_ops.mmap = perf_event__repipe_mmap; - inject_ops.fork = perf_event__repipe_task; - inject_ops.tracing_data = perf_event__repipe_tracing_data; + perf_inject.sample = perf_event__inject_buildid; + perf_inject.mmap = perf_event__repipe_mmap; + perf_inject.fork = perf_event__repipe_task; + perf_inject.tracing_data = perf_event__repipe_tracing_data; } - session = perf_session__new(input_name, O_RDONLY, false, true, &inject_ops); + session = perf_session__new(input_name, O_RDONLY, false, true, &perf_inject); if (session == NULL) return -ENOMEM; - ret = perf_session__process_events(session, &inject_ops); + ret = perf_session__process_events(session, &perf_inject); perf_session__delete(session); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 225e963df10..fe1ad8f2196 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -7,6 +7,7 @@ #include "util/thread.h" #include "util/header.h" #include "util/session.h" +#include "util/tool.h" #include "util/parse-options.h" #include "util/trace-event.h" @@ -18,7 +19,7 @@ struct alloc_stat; typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *); -static char const *input_name = "perf.data"; +static const char *input_name; static int alloc_flag; static int caller_flag; @@ -303,12 +304,13 @@ static void process_raw_event(union perf_event *raw_event __used, void *data, } } -static int process_sample_event(union perf_event *event, +static int process_sample_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct perf_session *session) + struct machine *machine) { - struct thread *thread = perf_session__findnew(session, event->ip.pid); + struct thread *thread = machine__findnew_thread(machine, event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", @@ -324,7 +326,7 @@ static int process_sample_event(union perf_event *event, return 0; } -static struct perf_event_ops event_ops = { +static struct perf_tool perf_kmem = { .sample = process_sample_event, .comm = perf_event__process_comm, .ordered_samples = true, @@ -483,7 +485,7 @@ static int __cmd_kmem(void) { int err = -EINVAL; struct perf_session *session = perf_session__new(input_name, O_RDONLY, - 0, false, &event_ops); + 0, false, &perf_kmem); if (session == NULL) return -ENOMEM; @@ -494,7 +496,7 @@ static int __cmd_kmem(void) goto out_delete; setup_pager(); - err = perf_session__process_events(session, &event_ops); + err = perf_session__process_events(session, &perf_kmem); if (err != 0) goto out_delete; sort_result(); diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 34d1e853829..032324a76b8 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -38,7 +38,7 @@ static const struct option kvm_options[] = { OPT_BOOLEAN(0, "guest", &perf_guest, "Collect guest os data"), OPT_BOOLEAN(0, "host", &perf_host, - "Collect guest os data"), + "Collect host os data"), OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory", "guest mount directory under which every guest os" " instance has a subdir"), diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index 899080ace26..2296c391d0f 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -12,6 +12,7 @@ #include "util/debug.h" #include "util/session.h" +#include "util/tool.h" #include <sys/types.h> #include <sys/prctl.h> @@ -325,7 +326,7 @@ alloc_failed: die("memory allocation failed\n"); } -static char const *input_name = "perf.data"; +static const char *input_name; struct raw_event_sample { u32 size; @@ -845,12 +846,13 @@ static void dump_info(void) die("Unknown type of information\n"); } -static int process_sample_event(union perf_event *event, +static int process_sample_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel __used, - struct perf_session *s) + struct machine *machine) { - struct thread *thread = perf_session__findnew(s, sample->tid); + struct thread *thread = machine__findnew_thread(machine, sample->tid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", @@ -863,7 +865,7 @@ static int process_sample_event(union perf_event *event, return 0; } -static struct perf_event_ops eops = { +static struct perf_tool eops = { .sample = process_sample_event, .comm = perf_event__process_comm, .ordered_samples = true, diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 710ae3d0a48..59d43abfbfe 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -46,7 +46,6 @@ #define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*" #define DEFAULT_FUNC_FILTER "!_*" -#define MAX_PATH_LEN 256 /* Session management structure */ static struct { diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6ab58cc99d5..0abfb18b911 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -22,6 +22,7 @@ #include "util/evsel.h" #include "util/debug.h" #include "util/session.h" +#include "util/tool.h" #include "util/symbol.h" #include "util/cpumap.h" #include "util/thread_map.h" @@ -35,55 +36,36 @@ enum write_mode_t { WRITE_APPEND }; -static u64 user_interval = ULLONG_MAX; -static u64 default_interval = 0; - -static unsigned int page_size; -static unsigned int mmap_pages = UINT_MAX; -static unsigned int user_freq = UINT_MAX; -static int freq = 1000; -static int output; -static int pipe_output = 0; -static const char *output_name = NULL; -static bool group = false; -static int realtime_prio = 0; -static bool nodelay = false; -static bool raw_samples = false; -static bool sample_id_all_avail = true; -static bool system_wide = false; -static pid_t target_pid = -1; -static pid_t target_tid = -1; -static pid_t child_pid = -1; -static bool no_inherit = false; -static enum write_mode_t write_mode = WRITE_FORCE; -static bool call_graph = false; -static bool inherit_stat = false; -static bool no_samples = false; -static bool sample_address = false; -static bool sample_time = false; -static bool no_buildid = false; -static bool no_buildid_cache = false; -static struct perf_evlist *evsel_list; - -static long samples = 0; -static u64 bytes_written = 0; - -static int file_new = 1; -static off_t post_processing_offset; - -static struct perf_session *session; -static const char *cpu_list; -static const char *progname; - -static void advance_output(size_t size) +struct perf_record { + struct perf_tool tool; + struct perf_record_opts opts; + u64 bytes_written; + const char *output_name; + struct perf_evlist *evlist; + struct perf_session *session; + const char *progname; + int output; + unsigned int page_size; + int realtime_prio; + enum write_mode_t write_mode; + bool no_buildid; + bool no_buildid_cache; + bool force; + bool file_new; + bool append_file; + long samples; + off_t post_processing_offset; +}; + +static void advance_output(struct perf_record *rec, size_t size) { - bytes_written += size; + rec->bytes_written += size; } -static void write_output(void *buf, size_t size) +static void write_output(struct perf_record *rec, void *buf, size_t size) { while (size) { - int ret = write(output, buf, size); + int ret = write(rec->output, buf, size); if (ret < 0) die("failed to write"); @@ -91,30 +73,33 @@ static void write_output(void *buf, size_t size) size -= ret; buf += ret; - bytes_written += ret; + rec->bytes_written += ret; } } -static int process_synthesized_event(union perf_event *event, +static int process_synthesized_event(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *self __used) + struct machine *machine __used) { - write_output(event, event->header.size); + struct perf_record *rec = container_of(tool, struct perf_record, tool); + write_output(rec, event, event->header.size); return 0; } -static void mmap_read(struct perf_mmap *md) +static void perf_record__mmap_read(struct perf_record *rec, + struct perf_mmap *md) { unsigned int head = perf_mmap__read_head(md); unsigned int old = md->prev; - unsigned char *data = md->base + page_size; + unsigned char *data = md->base + rec->page_size; unsigned long size; void *buf; if (old == head) return; - samples++; + rec->samples++; size = head - old; @@ -123,14 +108,14 @@ static void mmap_read(struct perf_mmap *md) size = md->mask + 1 - (old & md->mask); old += size; - write_output(buf, size); + write_output(rec, buf, size); } buf = &data[old & md->mask]; size = head - old; old += size; - write_output(buf, size); + write_output(rec, buf, size); md->prev = old; perf_mmap__write_tail(md, old); @@ -149,17 +134,18 @@ static void sig_handler(int sig) signr = sig; } -static void sig_atexit(void) +static void perf_record__sig_exit(int exit_status __used, void *arg) { + struct perf_record *rec = arg; int status; - if (child_pid > 0) { + if (rec->evlist->workload.pid > 0) { if (!child_finished) - kill(child_pid, SIGTERM); + kill(rec->evlist->workload.pid, SIGTERM); wait(&status); if (WIFSIGNALED(status)) - psignal(WTERMSIG(status), progname); + psignal(WTERMSIG(status), rec->progname); } if (signr == -1 || signr == SIGUSR1) @@ -169,78 +155,6 @@ static void sig_atexit(void) kill(getpid(), signr); } -static void config_attr(struct perf_evsel *evsel, struct perf_evlist *evlist) -{ - struct perf_event_attr *attr = &evsel->attr; - int track = !evsel->idx; /* only the first counter needs these */ - - attr->disabled = 1; - attr->inherit = !no_inherit; - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING | - PERF_FORMAT_ID; - - attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; - - if (evlist->nr_entries > 1) - attr->sample_type |= PERF_SAMPLE_ID; - - /* - * We default some events to a 1 default interval. But keep - * it a weak assumption overridable by the user. - */ - if (!attr->sample_period || (user_freq != UINT_MAX && - user_interval != ULLONG_MAX)) { - if (freq) { - attr->sample_type |= PERF_SAMPLE_PERIOD; - attr->freq = 1; - attr->sample_freq = freq; - } else { - attr->sample_period = default_interval; - } - } - - if (no_samples) - attr->sample_freq = 0; - - if (inherit_stat) - attr->inherit_stat = 1; - - if (sample_address) { - attr->sample_type |= PERF_SAMPLE_ADDR; - attr->mmap_data = track; - } - - if (call_graph) - attr->sample_type |= PERF_SAMPLE_CALLCHAIN; - - if (system_wide) - attr->sample_type |= PERF_SAMPLE_CPU; - - if (sample_id_all_avail && - (sample_time || system_wide || !no_inherit || cpu_list)) - attr->sample_type |= PERF_SAMPLE_TIME; - - if (raw_samples) { - attr->sample_type |= PERF_SAMPLE_TIME; - attr->sample_type |= PERF_SAMPLE_RAW; - attr->sample_type |= PERF_SAMPLE_CPU; - } - - if (nodelay) { - attr->watermark = 0; - attr->wakeup_events = 1; - } - - attr->mmap = track; - attr->comm = track; - - if (target_pid == -1 && target_tid == -1 && !system_wide) { - attr->disabled = 1; - attr->enable_on_exec = 1; - } -} - static bool perf_evlist__equal(struct perf_evlist *evlist, struct perf_evlist *other) { @@ -260,15 +174,17 @@ static bool perf_evlist__equal(struct perf_evlist *evlist, return true; } -static void open_counters(struct perf_evlist *evlist) +static void perf_record__open(struct perf_record *rec) { struct perf_evsel *pos, *first; - - if (evlist->cpus->map[0] < 0) - no_inherit = true; + struct perf_evlist *evlist = rec->evlist; + struct perf_session *session = rec->session; + struct perf_record_opts *opts = &rec->opts; first = list_entry(evlist->entries.next, struct perf_evsel, node); + perf_evlist__config_attrs(evlist, opts); + list_for_each_entry(pos, &evlist->entries, node) { struct perf_event_attr *attr = &pos->attr; struct xyarray *group_fd = NULL; @@ -286,29 +202,27 @@ static void open_counters(struct perf_evlist *evlist) */ bool time_needed = attr->sample_type & PERF_SAMPLE_TIME; - if (group && pos != first) + if (opts->group && pos != first) group_fd = first->fd; - - config_attr(pos, evlist); retry_sample_id: - attr->sample_id_all = sample_id_all_avail ? 1 : 0; + attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0; try_again: - if (perf_evsel__open(pos, evlist->cpus, evlist->threads, group, - group_fd) < 0) { + if (perf_evsel__open(pos, evlist->cpus, evlist->threads, + opts->group, group_fd) < 0) { int err = errno; if (err == EPERM || err == EACCES) { ui__error_paranoid(); exit(EXIT_FAILURE); - } else if (err == ENODEV && cpu_list) { + } else if (err == ENODEV && opts->cpu_list) { die("No such device - did you specify" " an out-of-range profile CPU?\n"); - } else if (err == EINVAL && sample_id_all_avail) { + } else if (err == EINVAL && opts->sample_id_all_avail) { /* * Old kernel, no attr->sample_id_type_all field */ - sample_id_all_avail = false; - if (!sample_time && !raw_samples && !time_needed) + opts->sample_id_all_avail = false; + if (!opts->sample_time && !opts->raw_samples && !time_needed) attr->sample_type &= ~PERF_SAMPLE_TIME; goto retry_sample_id; @@ -358,10 +272,20 @@ try_again: exit(-1); } - if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) + if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) { + if (errno == EPERM) + die("Permission error mapping pages.\n" + "Consider increasing " + "/proc/sys/kernel/perf_event_mlock_kb,\n" + "or try again with a smaller value of -m/--mmap_pages.\n" + "(current value: %d)\n", opts->mmap_pages); + else if (!is_power_of_2(opts->mmap_pages)) + die("--mmap_pages/-m value must be a power of two."); + die("failed to mmap with %d (%s)\n", errno, strerror(errno)); + } - if (file_new) + if (rec->file_new) session->evlist = evlist; else { if (!perf_evlist__equal(session->evlist, evlist)) { @@ -373,29 +297,32 @@ try_again: perf_session__update_sample_type(session); } -static int process_buildids(void) +static int process_buildids(struct perf_record *rec) { - u64 size = lseek(output, 0, SEEK_CUR); + u64 size = lseek(rec->output, 0, SEEK_CUR); if (size == 0) return 0; - session->fd = output; - return __perf_session__process_events(session, post_processing_offset, - size - post_processing_offset, + rec->session->fd = rec->output; + return __perf_session__process_events(rec->session, rec->post_processing_offset, + size - rec->post_processing_offset, size, &build_id__mark_dso_hit_ops); } -static void atexit_header(void) +static void perf_record__exit(int status __used, void *arg) { - if (!pipe_output) { - session->header.data_size += bytes_written; - - if (!no_buildid) - process_buildids(); - perf_session__write_header(session, evsel_list, output, true); - perf_session__delete(session); - perf_evlist__delete(evsel_list); + struct perf_record *rec = arg; + + if (!rec->opts.pipe_output) { + rec->session->header.data_size += rec->bytes_written; + + if (!rec->no_buildid) + process_buildids(rec); + perf_session__write_header(rec->session, rec->evlist, + rec->output, true); + perf_session__delete(rec->session); + perf_evlist__delete(rec->evlist); symbol__exit(); } } @@ -403,7 +330,7 @@ static void atexit_header(void) static void perf_event__synthesize_guest_os(struct machine *machine, void *data) { int err; - struct perf_session *psession = data; + struct perf_tool *tool = data; if (machine__is_host(machine)) return; @@ -416,8 +343,8 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data) *method is used to avoid symbol missing when the first addr is *in module instead of in guest kernel. */ - err = perf_event__synthesize_modules(process_synthesized_event, - psession, machine); + err = perf_event__synthesize_modules(tool, process_synthesized_event, + machine); if (err < 0) pr_err("Couldn't record guest kernel [%d]'s reference" " relocation symbol.\n", machine->pid); @@ -426,12 +353,11 @@ static void perf_event__synthesize_guest_os(struct machine *machine, void *data) * We use _stext for guest kernel because guest kernel's /proc/kallsyms * have no _text sometimes. */ - err = perf_event__synthesize_kernel_mmap(process_synthesized_event, - psession, machine, "_text"); + err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, + machine, "_text"); if (err < 0) - err = perf_event__synthesize_kernel_mmap(process_synthesized_event, - psession, machine, - "_stext"); + err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, + machine, "_stext"); if (err < 0) pr_err("Couldn't record guest kernel [%d]'s reference" " relocation symbol.\n", machine->pid); @@ -442,73 +368,71 @@ static struct perf_event_header finished_round_event = { .type = PERF_RECORD_FINISHED_ROUND, }; -static void mmap_read_all(void) +static void perf_record__mmap_read_all(struct perf_record *rec) { int i; - for (i = 0; i < evsel_list->nr_mmaps; i++) { - if (evsel_list->mmap[i].base) - mmap_read(&evsel_list->mmap[i]); + for (i = 0; i < rec->evlist->nr_mmaps; i++) { + if (rec->evlist->mmap[i].base) + perf_record__mmap_read(rec, &rec->evlist->mmap[i]); } - if (perf_header__has_feat(&session->header, HEADER_TRACE_INFO)) - write_output(&finished_round_event, sizeof(finished_round_event)); + if (perf_header__has_feat(&rec->session->header, HEADER_TRACE_INFO)) + write_output(rec, &finished_round_event, sizeof(finished_round_event)); } -static int __cmd_record(int argc, const char **argv) +static int __cmd_record(struct perf_record *rec, int argc, const char **argv) { struct stat st; int flags; - int err; + int err, output; unsigned long waking = 0; - int child_ready_pipe[2], go_pipe[2]; const bool forks = argc > 0; - char buf; struct machine *machine; + struct perf_tool *tool = &rec->tool; + struct perf_record_opts *opts = &rec->opts; + struct perf_evlist *evsel_list = rec->evlist; + const char *output_name = rec->output_name; + struct perf_session *session; - progname = argv[0]; + rec->progname = argv[0]; - page_size = sysconf(_SC_PAGE_SIZE); + rec->page_size = sysconf(_SC_PAGE_SIZE); - atexit(sig_atexit); + on_exit(perf_record__sig_exit, rec); signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); signal(SIGUSR1, sig_handler); - if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) { - perror("failed to create pipes"); - exit(-1); - } - if (!output_name) { if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode)) - pipe_output = 1; + opts->pipe_output = true; else - output_name = "perf.data"; + rec->output_name = output_name = "perf.data"; } if (output_name) { if (!strcmp(output_name, "-")) - pipe_output = 1; + opts->pipe_output = true; else if (!stat(output_name, &st) && st.st_size) { - if (write_mode == WRITE_FORCE) { + if (rec->write_mode == WRITE_FORCE) { char oldname[PATH_MAX]; snprintf(oldname, sizeof(oldname), "%s.old", output_name); unlink(oldname); rename(output_name, oldname); } - } else if (write_mode == WRITE_APPEND) { - write_mode = WRITE_FORCE; + } else if (rec->write_mode == WRITE_APPEND) { + rec->write_mode = WRITE_FORCE; } } flags = O_CREAT|O_RDWR; - if (write_mode == WRITE_APPEND) - file_new = 0; + if (rec->write_mode == WRITE_APPEND) + rec->file_new = 0; else flags |= O_TRUNC; - if (pipe_output) + if (opts->pipe_output) output = STDOUT_FILENO; else output = open(output_name, flags, S_IRUSR | S_IWUSR); @@ -517,17 +441,21 @@ static int __cmd_record(int argc, const char **argv) exit(-1); } + rec->output = output; + session = perf_session__new(output_name, O_WRONLY, - write_mode == WRITE_FORCE, false, NULL); + rec->write_mode == WRITE_FORCE, false, NULL); if (session == NULL) { pr_err("Not enough memory for reading perf file header\n"); return -1; } - if (!no_buildid) + rec->session = session; + + if (!rec->no_buildid) perf_header__set_feat(&session->header, HEADER_BUILD_ID); - if (!file_new) { + if (!rec->file_new) { err = perf_session__read_header(session, output); if (err < 0) goto out_delete_session; @@ -549,94 +477,57 @@ static int __cmd_record(int argc, const char **argv) perf_header__set_feat(&session->header, HEADER_NUMA_TOPOLOGY); perf_header__set_feat(&session->header, HEADER_CPUID); - /* 512 kiB: default amount of unprivileged mlocked memory */ - if (mmap_pages == UINT_MAX) - mmap_pages = (512 * 1024) / page_size; - if (forks) { - child_pid = fork(); - if (child_pid < 0) { - perror("failed to fork"); - exit(-1); - } - - if (!child_pid) { - if (pipe_output) - dup2(2, 1); - close(child_ready_pipe[0]); - close(go_pipe[1]); - fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); - - /* - * Do a dummy execvp to get the PLT entry resolved, - * so we avoid the resolver overhead on the real - * execvp call. - */ - execvp("", (char **)argv); - - /* - * Tell the parent we're ready to go - */ - close(child_ready_pipe[1]); - - /* - * Wait until the parent tells us to go. - */ - if (read(go_pipe[0], &buf, 1) == -1) - perror("unable to read pipe"); - - execvp(argv[0], (char **)argv); - - perror(argv[0]); - kill(getppid(), SIGUSR1); - exit(-1); - } - - if (!system_wide && target_tid == -1 && target_pid == -1) - evsel_list->threads->map[0] = child_pid; - - close(child_ready_pipe[1]); - close(go_pipe[0]); - /* - * wait for child to settle - */ - if (read(child_ready_pipe[0], &buf, 1) == -1) { - perror("unable to read pipe"); - exit(-1); + err = perf_evlist__prepare_workload(evsel_list, opts, argv); + if (err < 0) { + pr_err("Couldn't run the workload!\n"); + goto out_delete_session; } - close(child_ready_pipe[0]); } - open_counters(evsel_list); + perf_record__open(rec); /* - * perf_session__delete(session) will be called at atexit_header() + * perf_session__delete(session) will be called at perf_record__exit() */ - atexit(atexit_header); + on_exit(perf_record__exit, rec); - if (pipe_output) { + if (opts->pipe_output) { err = perf_header__write_pipe(output); if (err < 0) return err; - } else if (file_new) { + } else if (rec->file_new) { err = perf_session__write_header(session, evsel_list, output, false); if (err < 0) return err; } - post_processing_offset = lseek(output, 0, SEEK_CUR); + if (!!rec->no_buildid + && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) { + pr_err("Couldn't generating buildids. " + "Use --no-buildid to profile anyway.\n"); + return -1; + } - if (pipe_output) { - err = perf_session__synthesize_attrs(session, - process_synthesized_event); + rec->post_processing_offset = lseek(output, 0, SEEK_CUR); + + machine = perf_session__find_host_machine(session); + if (!machine) { + pr_err("Couldn't find native kernel information.\n"); + return -1; + } + + if (opts->pipe_output) { + err = perf_event__synthesize_attrs(tool, session, + process_synthesized_event); if (err < 0) { pr_err("Couldn't synthesize attrs.\n"); return err; } - err = perf_event__synthesize_event_types(process_synthesized_event, - session); + err = perf_event__synthesize_event_types(tool, process_synthesized_event, + machine); if (err < 0) { pr_err("Couldn't synthesize event_types.\n"); return err; @@ -651,56 +542,49 @@ static int __cmd_record(int argc, const char **argv) * return this more properly and also * propagate errors that now are calling die() */ - err = perf_event__synthesize_tracing_data(output, evsel_list, - process_synthesized_event, - session); + err = perf_event__synthesize_tracing_data(tool, output, evsel_list, + process_synthesized_event); if (err <= 0) { pr_err("Couldn't record tracing data.\n"); return err; } - advance_output(err); + advance_output(rec, err); } } - machine = perf_session__find_host_machine(session); - if (!machine) { - pr_err("Couldn't find native kernel information.\n"); - return -1; - } - - err = perf_event__synthesize_kernel_mmap(process_synthesized_event, - session, machine, "_text"); + err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, + machine, "_text"); if (err < 0) - err = perf_event__synthesize_kernel_mmap(process_synthesized_event, - session, machine, "_stext"); + err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event, + machine, "_stext"); if (err < 0) pr_err("Couldn't record kernel reference relocation symbol\n" "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" "Check /proc/kallsyms permission or run as root.\n"); - err = perf_event__synthesize_modules(process_synthesized_event, - session, machine); + err = perf_event__synthesize_modules(tool, process_synthesized_event, + machine); if (err < 0) pr_err("Couldn't record kernel module information.\n" "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n" "Check /proc/modules permission or run as root.\n"); if (perf_guest) - perf_session__process_machines(session, + perf_session__process_machines(session, tool, perf_event__synthesize_guest_os); - if (!system_wide) - perf_event__synthesize_thread_map(evsel_list->threads, + if (!opts->system_wide) + perf_event__synthesize_thread_map(tool, evsel_list->threads, process_synthesized_event, - session); + machine); else - perf_event__synthesize_threads(process_synthesized_event, - session); + perf_event__synthesize_threads(tool, process_synthesized_event, + machine); - if (realtime_prio) { + if (rec->realtime_prio) { struct sched_param param; - param.sched_priority = realtime_prio; + param.sched_priority = rec->realtime_prio; if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { pr_err("Could not set realtime priority.\n"); exit(-1); @@ -713,14 +597,14 @@ static int __cmd_record(int argc, const char **argv) * Let the child rip */ if (forks) - close(go_pipe[1]); + perf_evlist__start_workload(evsel_list); for (;;) { - int hits = samples; + int hits = rec->samples; - mmap_read_all(); + perf_record__mmap_read_all(rec); - if (hits == samples) { + if (hits == rec->samples) { if (done) break; err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1); @@ -741,9 +625,9 @@ static int __cmd_record(int argc, const char **argv) */ fprintf(stderr, "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n", - (double)bytes_written / 1024.0 / 1024.0, + (double)rec->bytes_written / 1024.0 / 1024.0, output_name, - bytes_written / 24); + rec->bytes_written / 24); return 0; @@ -758,58 +642,89 @@ static const char * const record_usage[] = { NULL }; -static bool force, append_file; +/* + * XXX Ideally would be local to cmd_record() and passed to a perf_record__new + * because we need to have access to it in perf_record__exit, that is called + * after cmd_record() exits, but since record_options need to be accessible to + * builtin-script, leave it here. + * + * At least we don't ouch it in all the other functions here directly. + * + * Just say no to tons of global variables, sigh. + */ +static struct perf_record record = { + .opts = { + .target_pid = -1, + .target_tid = -1, + .mmap_pages = UINT_MAX, + .user_freq = UINT_MAX, + .user_interval = ULLONG_MAX, + .freq = 1000, + .sample_id_all_avail = true, + }, + .write_mode = WRITE_FORCE, + .file_new = true, +}; +/* + * XXX Will stay a global variable till we fix builtin-script.c to stop messing + * with it and switch to use the library functions in perf_evlist that came + * from builtin-record.c, i.e. use perf_record_opts, + * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record', + * using pipes, etc. + */ const struct option record_options[] = { - OPT_CALLBACK('e', "event", &evsel_list, "event", + OPT_CALLBACK('e', "event", &record.evlist, "event", "event selector. use 'perf list' to list available events", parse_events_option), - OPT_CALLBACK(0, "filter", &evsel_list, "filter", + OPT_CALLBACK(0, "filter", &record.evlist, "filter", "event filter", parse_filter), - OPT_INTEGER('p', "pid", &target_pid, + OPT_INTEGER('p', "pid", &record.opts.target_pid, "record events on existing process id"), - OPT_INTEGER('t', "tid", &target_tid, + OPT_INTEGER('t', "tid", &record.opts.target_tid, "record events on existing thread id"), - OPT_INTEGER('r', "realtime", &realtime_prio, + OPT_INTEGER('r', "realtime", &record.realtime_prio, "collect data with this RT SCHED_FIFO priority"), - OPT_BOOLEAN('D', "no-delay", &nodelay, + OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay, "collect data without buffering"), - OPT_BOOLEAN('R', "raw-samples", &raw_samples, + OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples, "collect raw sample records from all opened counters"), - OPT_BOOLEAN('a', "all-cpus", &system_wide, + OPT_BOOLEAN('a', "all-cpus", &record.opts.system_wide, "system-wide collection from all CPUs"), - OPT_BOOLEAN('A', "append", &append_file, + OPT_BOOLEAN('A', "append", &record.append_file, "append to the output file to do incremental profiling"), - OPT_STRING('C', "cpu", &cpu_list, "cpu", + OPT_STRING('C', "cpu", &record.opts.cpu_list, "cpu", "list of cpus to monitor"), - OPT_BOOLEAN('f', "force", &force, + OPT_BOOLEAN('f', "force", &record.force, "overwrite existing data file (deprecated)"), - OPT_U64('c', "count", &user_interval, "event period to sample"), - OPT_STRING('o', "output", &output_name, "file", + OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"), + OPT_STRING('o', "output", &record.output_name, "file", "output file name"), - OPT_BOOLEAN('i', "no-inherit", &no_inherit, + OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit, "child tasks do not inherit counters"), - OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"), - OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), - OPT_BOOLEAN(0, "group", &group, + OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"), + OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages, + "number of mmap data pages"), + OPT_BOOLEAN(0, "group", &record.opts.group, "put the counters into a counter group"), - OPT_BOOLEAN('g', "call-graph", &call_graph, + OPT_BOOLEAN('g', "call-graph", &record.opts.call_graph, "do call-graph (stack chain/backtrace) recording"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"), - OPT_BOOLEAN('s', "stat", &inherit_stat, + OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat, "per thread counts"), - OPT_BOOLEAN('d', "data", &sample_address, + OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Sample addresses"), - OPT_BOOLEAN('T', "timestamp", &sample_time, "Sample timestamps"), - OPT_BOOLEAN('n', "no-samples", &no_samples, + OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"), + OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"), + OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples, "don't sample"), - OPT_BOOLEAN('N', "no-buildid-cache", &no_buildid_cache, + OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache, "do not update the buildid cache"), - OPT_BOOLEAN('B', "no-buildid", &no_buildid, + OPT_BOOLEAN('B', "no-buildid", &record.no_buildid, "do not collect buildids in perf.data"), - OPT_CALLBACK('G', "cgroup", &evsel_list, "name", + OPT_CALLBACK('G', "cgroup", &record.evlist, "name", "monitor event in cgroup name only", parse_cgroups), OPT_END() @@ -819,6 +734,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) { int err = -ENOMEM; struct perf_evsel *pos; + struct perf_evlist *evsel_list; + struct perf_record *rec = &record; perf_header__set_cmdline(argc, argv); @@ -826,23 +743,25 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) if (evsel_list == NULL) return -ENOMEM; + rec->evlist = evsel_list; + argc = parse_options(argc, argv, record_options, record_usage, PARSE_OPT_STOP_AT_NON_OPTION); - if (!argc && target_pid == -1 && target_tid == -1 && - !system_wide && !cpu_list) + if (!argc && rec->opts.target_pid == -1 && rec->opts.target_tid == -1 && + !rec->opts.system_wide && !rec->opts.cpu_list) usage_with_options(record_usage, record_options); - if (force && append_file) { + if (rec->force && rec->append_file) { fprintf(stderr, "Can't overwrite and append at the same time." " You need to choose between -f and -A"); usage_with_options(record_usage, record_options); - } else if (append_file) { - write_mode = WRITE_APPEND; + } else if (rec->append_file) { + rec->write_mode = WRITE_APPEND; } else { - write_mode = WRITE_FORCE; + rec->write_mode = WRITE_FORCE; } - if (nr_cgroups && !system_wide) { + if (nr_cgroups && !rec->opts.system_wide) { fprintf(stderr, "cgroup monitoring only available in" " system-wide mode\n"); usage_with_options(record_usage, record_options); @@ -860,7 +779,7 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) "If some relocation was applied (e.g. kexec) symbols may be misresolved\n" "even with a suitable vmlinux or kallsyms file.\n\n"); - if (no_buildid_cache || no_buildid) + if (rec->no_buildid_cache || rec->no_buildid) disable_buildid_cache(); if (evsel_list->nr_entries == 0 && @@ -869,43 +788,37 @@ int cmd_record(int argc, const char **argv, const char *prefix __used) goto out_symbol_exit; } - if (target_pid != -1) - target_tid = target_pid; + if (rec->opts.target_pid != -1) + rec->opts.target_tid = rec->opts.target_pid; - if (perf_evlist__create_maps(evsel_list, target_pid, - target_tid, cpu_list) < 0) + if (perf_evlist__create_maps(evsel_list, rec->opts.target_pid, + rec->opts.target_tid, rec->opts.cpu_list) < 0) usage_with_options(record_usage, record_options); list_for_each_entry(pos, &evsel_list->entries, node) { - if (perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, - evsel_list->threads->nr) < 0) - goto out_free_fd; if (perf_header__push_event(pos->attr.config, event_name(pos))) goto out_free_fd; } - if (perf_evlist__alloc_pollfd(evsel_list) < 0) - goto out_free_fd; - - if (user_interval != ULLONG_MAX) - default_interval = user_interval; - if (user_freq != UINT_MAX) - freq = user_freq; + if (rec->opts.user_interval != ULLONG_MAX) + rec->opts.default_interval = rec->opts.user_interval; + if (rec->opts.user_freq != UINT_MAX) + rec->opts.freq = rec->opts.user_freq; /* * User specified count overrides default frequency. */ - if (default_interval) - freq = 0; - else if (freq) { - default_interval = freq; + if (rec->opts.default_interval) + rec->opts.freq = 0; + else if (rec->opts.freq) { + rec->opts.default_interval = rec->opts.freq; } else { fprintf(stderr, "frequency and count are zero, aborting\n"); err = -EINVAL; goto out_free_fd; } - err = __cmd_record(argc, argv); + err = __cmd_record(&record, argc, argv); out_free_fd: perf_evlist__delete_maps(evsel_list); out_symbol_exit: diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 4d7c8340c32..25d34d483e4 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -25,6 +25,7 @@ #include "util/evsel.h" #include "util/header.h" #include "util/session.h" +#include "util/tool.h" #include "util/parse-options.h" #include "util/parse-events.h" @@ -35,38 +36,35 @@ #include <linux/bitmap.h> -static char const *input_name = "perf.data"; - -static bool force, use_tui, use_stdio; -static bool hide_unresolved; -static bool dont_use_callchains; -static bool show_full_info; - -static bool show_threads; -static struct perf_read_values show_threads_values; - -static const char default_pretty_printing_style[] = "normal"; -static const char *pretty_printing_style = default_pretty_printing_style; - -static char callchain_default_opt[] = "fractal,0.5,callee"; -static bool inverted_callchain; -static symbol_filter_t annotate_init; - -static const char *cpu_list; -static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); +struct perf_report { + struct perf_tool tool; + struct perf_session *session; + char const *input_name; + bool force, use_tui, use_stdio; + bool hide_unresolved; + bool dont_use_callchains; + bool show_full_info; + bool show_threads; + bool inverted_callchain; + struct perf_read_values show_threads_values; + const char *pretty_printing_style; + symbol_filter_t annotate_init; + const char *cpu_list; + DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); +}; -static int perf_session__add_hist_entry(struct perf_session *session, - struct addr_location *al, - struct perf_sample *sample, - struct perf_evsel *evsel) +static int perf_evsel__add_hist_entry(struct perf_evsel *evsel, + struct addr_location *al, + struct perf_sample *sample, + struct machine *machine) { struct symbol *parent = NULL; int err = 0; struct hist_entry *he; if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = perf_session__resolve_callchain(session, al->thread, - sample->callchain, &parent); + err = machine__resolve_callchain(machine, evsel, al->thread, + sample->callchain, &parent); if (err) return err; } @@ -76,7 +74,8 @@ static int perf_session__add_hist_entry(struct perf_session *session, return -ENOMEM; if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, &session->callchain_cursor, + err = callchain_append(he->callchain, + &evsel->hists.callchain_cursor, sample->period); if (err) return err; @@ -92,8 +91,7 @@ static int perf_session__add_hist_entry(struct perf_session *session, assert(evsel != NULL); err = -ENOMEM; - if (notes->src == NULL && - symbol__alloc_hist(he->ms.sym, session->evlist->nr_entries) < 0) + if (notes->src == NULL && symbol__alloc_hist(he->ms.sym) < 0) goto out; err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr); @@ -106,30 +104,32 @@ out: } -static int process_sample_event(union perf_event *event, +static int process_sample_event(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, - struct perf_session *session) + struct machine *machine) { + struct perf_report *rep = container_of(tool, struct perf_report, tool); struct addr_location al; - if (perf_event__preprocess_sample(event, session, &al, sample, - annotate_init) < 0) { + if (perf_event__preprocess_sample(event, machine, &al, sample, + rep->annotate_init) < 0) { fprintf(stderr, "problem processing %d event, skipping it.\n", event->header.type); return -1; } - if (al.filtered || (hide_unresolved && al.sym == NULL)) + if (al.filtered || (rep->hide_unresolved && al.sym == NULL)) return 0; - if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) + if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap)) return 0; if (al.map != NULL) al.map->dso->hit = 1; - if (perf_session__add_hist_entry(session, &al, sample, evsel)) { + if (perf_evsel__add_hist_entry(evsel, &al, sample, machine)) { pr_debug("problem incrementing symbol period, skipping event\n"); return -1; } @@ -137,15 +137,17 @@ static int process_sample_event(union perf_event *event, return 0; } -static int process_read_event(union perf_event *event, +static int process_read_event(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct perf_evsel *evsel, + struct machine *machine __used) { - struct perf_evsel *evsel = perf_evlist__id2evsel(session->evlist, - event->read.id); - if (show_threads) { + struct perf_report *rep = container_of(tool, struct perf_report, tool); + + if (rep->show_threads) { const char *name = evsel ? event_name(evsel) : "unknown"; - perf_read_values_add_value(&show_threads_values, + perf_read_values_add_value(&rep->show_threads_values, event->read.pid, event->read.tid, event->read.id, name, @@ -159,8 +161,10 @@ static int process_read_event(union perf_event *event, return 0; } -static int perf_session__setup_sample_type(struct perf_session *self) +static int perf_report__setup_sample_type(struct perf_report *rep) { + struct perf_session *self = rep->session; + if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) { if (sort__has_parent) { ui__warning("Selected --sort parent, but no " @@ -173,7 +177,8 @@ static int perf_session__setup_sample_type(struct perf_session *self) "you call 'perf record' without -g?\n"); return -1; } - } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE && + } else if (!rep->dont_use_callchains && + callchain_param.mode != CHAIN_NONE && !symbol_conf.use_callchain) { symbol_conf.use_callchain = true; if (callchain_register_param(&callchain_param) < 0) { @@ -186,22 +191,6 @@ static int perf_session__setup_sample_type(struct perf_session *self) return 0; } -static struct perf_event_ops event_ops = { - .sample = process_sample_event, - .mmap = perf_event__process_mmap, - .comm = perf_event__process_comm, - .exit = perf_event__process_task, - .fork = perf_event__process_task, - .lost = perf_event__process_lost, - .read = process_read_event, - .attr = perf_event__process_attr, - .event_type = perf_event__process_event_type, - .tracing_data = perf_event__process_tracing_data, - .build_id = perf_event__process_build_id, - .ordered_samples = true, - .ordering_requires_timestamps = true, -}; - extern volatile int session_done; static void sig_handler(int sig __used) @@ -224,6 +213,7 @@ static size_t hists__fprintf_nr_sample_events(struct hists *self, } static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, + struct perf_report *rep, const char *help) { struct perf_evsel *pos; @@ -241,18 +231,18 @@ static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist, parent_pattern == default_parent_pattern) { fprintf(stdout, "#\n# (%s)\n#\n", help); - if (show_threads) { - bool style = !strcmp(pretty_printing_style, "raw"); - perf_read_values_display(stdout, &show_threads_values, + if (rep->show_threads) { + bool style = !strcmp(rep->pretty_printing_style, "raw"); + perf_read_values_display(stdout, &rep->show_threads_values, style); - perf_read_values_destroy(&show_threads_values); + perf_read_values_destroy(&rep->show_threads_values); } } return 0; } -static int __cmd_report(void) +static int __cmd_report(struct perf_report *rep) { int ret = -EINVAL; u64 nr_samples; @@ -264,27 +254,31 @@ static int __cmd_report(void) signal(SIGINT, sig_handler); - session = perf_session__new(input_name, O_RDONLY, force, false, &event_ops); + session = perf_session__new(rep->input_name, O_RDONLY, + rep->force, false, &rep->tool); if (session == NULL) return -ENOMEM; - if (cpu_list) { - ret = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap); + rep->session = session; + + if (rep->cpu_list) { + ret = perf_session__cpu_bitmap(session, rep->cpu_list, + rep->cpu_bitmap); if (ret) goto out_delete; } if (use_browser <= 0) - perf_session__fprintf_info(session, stdout, show_full_info); + perf_session__fprintf_info(session, stdout, rep->show_full_info); - if (show_threads) - perf_read_values_init(&show_threads_values); + if (rep->show_threads) + perf_read_values_init(&rep->show_threads_values); - ret = perf_session__setup_sample_type(session); + ret = perf_report__setup_sample_type(rep); if (ret) goto out_delete; - ret = perf_session__process_events(session, &event_ops); + ret = perf_session__process_events(session, &rep->tool); if (ret) goto out_delete; @@ -327,7 +321,7 @@ static int __cmd_report(void) } if (nr_samples == 0) { - ui__warning("The %s file has no samples!\n", input_name); + ui__warning("The %s file has no samples!\n", session->filename); goto out_delete; } @@ -335,7 +329,7 @@ static int __cmd_report(void) perf_evlist__tui_browse_hists(session->evlist, help, NULL, NULL, 0); } else - perf_evlist__tty_browse_hists(session->evlist, help); + perf_evlist__tty_browse_hists(session->evlist, rep, help); out_delete: /* @@ -354,9 +348,9 @@ out_delete: } static int -parse_callchain_opt(const struct option *opt __used, const char *arg, - int unset) +parse_callchain_opt(const struct option *opt, const char *arg, int unset) { + struct perf_report *rep = (struct perf_report *)opt->value; char *tok, *tok2; char *endptr; @@ -364,7 +358,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, * --no-call-graph */ if (unset) { - dont_use_callchains = true; + rep->dont_use_callchains = true; return 0; } @@ -412,7 +406,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, goto setup; if (tok2[0] != 'c') { - callchain_param.print_limit = strtod(tok2, &endptr); + callchain_param.print_limit = strtoul(tok2, &endptr, 0); tok2 = strtok(NULL, ","); if (!tok2) goto setup; @@ -433,13 +427,34 @@ setup: return 0; } -static const char * const report_usage[] = { - "perf report [<options>] <command>", - NULL -}; - -static const struct option options[] = { - OPT_STRING('i', "input", &input_name, "file", +int cmd_report(int argc, const char **argv, const char *prefix __used) +{ + struct stat st; + char callchain_default_opt[] = "fractal,0.5,callee"; + const char * const report_usage[] = { + "perf report [<options>]", + NULL + }; + struct perf_report report = { + .tool = { + .sample = process_sample_event, + .mmap = perf_event__process_mmap, + .comm = perf_event__process_comm, + .exit = perf_event__process_task, + .fork = perf_event__process_task, + .lost = perf_event__process_lost, + .read = process_read_event, + .attr = perf_event__process_attr, + .event_type = perf_event__process_event_type, + .tracing_data = perf_event__process_tracing_data, + .build_id = perf_event__process_build_id, + .ordered_samples = true, + .ordering_requires_timestamps = true, + }, + .pretty_printing_style = "normal", + }; + const struct option options[] = { + OPT_STRING('i', "input", &report.input_name, "file", "input file name"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), @@ -449,17 +464,18 @@ static const struct option options[] = { "file", "vmlinux pathname"), OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name, "file", "kallsyms pathname"), - OPT_BOOLEAN('f', "force", &force, "don't complain, do it"), + OPT_BOOLEAN('f', "force", &report.force, "don't complain, do it"), OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules, "load module symbols - WARNING: use only with -k and LIVE kernel"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_BOOLEAN('T', "threads", &show_threads, + OPT_BOOLEAN('T', "threads", &report.show_threads, "Show per-thread event counters"), - OPT_STRING(0, "pretty", &pretty_printing_style, "key", + OPT_STRING(0, "pretty", &report.pretty_printing_style, "key", "pretty printing style key: normal raw"), - OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), - OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), + OPT_BOOLEAN(0, "tui", &report.use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &report.use_stdio, + "Use the stdio interface"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization, @@ -468,13 +484,14 @@ static const struct option options[] = { "regex filter to identify parent, see: '--sort parent'"), OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other, "Only display entries with parent-match"), - OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent, call_order", - "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold and callchain order. " + OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order", + "Display callchains using output_type (graph, flat, fractal, or none) , min percent threshold, optional print limit and callchain order. " "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt), - OPT_BOOLEAN('G', "inverted", &inverted_callchain, "alias for inverted call graph"), + OPT_BOOLEAN('G', "inverted", &report.inverted_callchain, + "alias for inverted call graph"), OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]", "only consider symbols in these dsos"), - OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", + OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", "only consider symbols in these comms"), OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]", "only consider these symbols"), @@ -484,12 +501,13 @@ static const struct option options[] = { OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator", "separator for columns, no spaces will be added between " "columns '.' is reserved."), - OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved, + OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved, "Only display entries resolved to a symbol"), OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory", "Look for files with symbols relative to this directory"), - OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), - OPT_BOOLEAN('I', "show-info", &show_full_info, + OPT_STRING('C', "cpu", &report.cpu_list, "cpu", + "list of cpus to profile"), + OPT_BOOLEAN('I', "show-info", &report.show_full_info, "Display extended information about perf.data file"), OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src, "Interleave source code with assembly code (default)"), @@ -500,24 +518,30 @@ static const struct option options[] = { OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period, "Show a column with the sum of periods"), OPT_END() -}; + }; -int cmd_report(int argc, const char **argv, const char *prefix __used) -{ argc = parse_options(argc, argv, options, report_usage, 0); - if (use_stdio) + if (report.use_stdio) use_browser = 0; - else if (use_tui) + else if (report.use_tui) use_browser = 1; - if (inverted_callchain) + if (report.inverted_callchain) callchain_param.order = ORDER_CALLER; - if (strcmp(input_name, "-") != 0) + if (!report.input_name || !strlen(report.input_name)) { + if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode)) + report.input_name = "-"; + else + report.input_name = "perf.data"; + } + + if (strcmp(report.input_name, "-") != 0) setup_browser(true); else use_browser = 0; + /* * Only in the newt browser we are doing integrated annotation, * so don't allocate extra space that won't be used in the stdio @@ -525,7 +549,7 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) */ if (use_browser > 0) { symbol_conf.priv_size = sizeof(struct annotation); - annotate_init = symbol__annotate_init; + report.annotate_init = symbol__annotate_init; /* * For searching by name on the "Browse map details". * providing it only in verbose mode not to bloat too @@ -572,5 +596,5 @@ int cmd_report(int argc, const char **argv, const char *prefix __used) sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout); sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout); - return __cmd_report(); + return __cmd_report(&report); } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 5177964943e..fb8b5f83b4a 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -2,11 +2,14 @@ #include "perf.h" #include "util/util.h" +#include "util/evlist.h" #include "util/cache.h" +#include "util/evsel.h" #include "util/symbol.h" #include "util/thread.h" #include "util/header.h" #include "util/session.h" +#include "util/tool.h" #include "util/parse-options.h" #include "util/trace-event.h" @@ -19,7 +22,7 @@ #include <pthread.h> #include <math.h> -static char const *input_name = "perf.data"; +static const char *input_name; static char default_sort_order[] = "avg, max, switch, runtime"; static const char *sort_order = default_sort_order; @@ -723,21 +726,21 @@ struct trace_migrate_task_event { struct trace_sched_handler { void (*switch_event)(struct trace_switch_event *, - struct perf_session *, + struct machine *, struct event *, int cpu, u64 timestamp, struct thread *thread); void (*runtime_event)(struct trace_runtime_event *, - struct perf_session *, + struct machine *, struct event *, int cpu, u64 timestamp, struct thread *thread); void (*wakeup_event)(struct trace_wakeup_event *, - struct perf_session *, + struct machine *, struct event *, int cpu, u64 timestamp, @@ -750,7 +753,7 @@ struct trace_sched_handler { struct thread *thread); void (*migrate_task_event)(struct trace_migrate_task_event *, - struct perf_session *session, + struct machine *machine, struct event *, int cpu, u64 timestamp, @@ -760,7 +763,7 @@ struct trace_sched_handler { static void replay_wakeup_event(struct trace_wakeup_event *wakeup_event, - struct perf_session *session __used, + struct machine *machine __used, struct event *event, int cpu __used, u64 timestamp __used, @@ -787,7 +790,7 @@ static u64 cpu_last_switched[MAX_CPUS]; static void replay_switch_event(struct trace_switch_event *switch_event, - struct perf_session *session __used, + struct machine *machine __used, struct event *event, int cpu, u64 timestamp, @@ -1021,7 +1024,7 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp) static void latency_switch_event(struct trace_switch_event *switch_event, - struct perf_session *session, + struct machine *machine, struct event *event __used, int cpu, u64 timestamp, @@ -1045,8 +1048,8 @@ latency_switch_event(struct trace_switch_event *switch_event, die("hm, delta: %" PRIu64 " < 0 ?\n", delta); - sched_out = perf_session__findnew(session, switch_event->prev_pid); - sched_in = perf_session__findnew(session, switch_event->next_pid); + sched_out = machine__findnew_thread(machine, switch_event->prev_pid); + sched_in = machine__findnew_thread(machine, switch_event->next_pid); out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid); if (!out_events) { @@ -1074,13 +1077,13 @@ latency_switch_event(struct trace_switch_event *switch_event, static void latency_runtime_event(struct trace_runtime_event *runtime_event, - struct perf_session *session, + struct machine *machine, struct event *event __used, int cpu, u64 timestamp, struct thread *this_thread __used) { - struct thread *thread = perf_session__findnew(session, runtime_event->pid); + struct thread *thread = machine__findnew_thread(machine, runtime_event->pid); struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid); BUG_ON(cpu >= MAX_CPUS || cpu < 0); @@ -1097,7 +1100,7 @@ latency_runtime_event(struct trace_runtime_event *runtime_event, static void latency_wakeup_event(struct trace_wakeup_event *wakeup_event, - struct perf_session *session, + struct machine *machine, struct event *__event __used, int cpu __used, u64 timestamp, @@ -1111,7 +1114,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, if (!wakeup_event->success) return; - wakee = perf_session__findnew(session, wakeup_event->pid); + wakee = machine__findnew_thread(machine, wakeup_event->pid); atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid); if (!atoms) { thread_atoms_insert(wakee); @@ -1145,7 +1148,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event, static void latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event, - struct perf_session *session, + struct machine *machine, struct event *__event __used, int cpu __used, u64 timestamp, @@ -1161,7 +1164,7 @@ latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event, if (profile_cpu == -1) return; - migrant = perf_session__findnew(session, migrate_task_event->pid); + migrant = machine__findnew_thread(machine, migrate_task_event->pid); atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid); if (!atoms) { thread_atoms_insert(migrant); @@ -1356,12 +1359,13 @@ static void sort_lat(void) static struct trace_sched_handler *trace_handler; static void -process_sched_wakeup_event(void *data, struct perf_session *session, +process_sched_wakeup_event(struct perf_tool *tool __used, struct event *event, - int cpu __used, - u64 timestamp __used, - struct thread *thread __used) + struct perf_sample *sample, + struct machine *machine, + struct thread *thread) { + void *data = sample->raw_data; struct trace_wakeup_event wakeup_event; FILL_COMMON_FIELDS(wakeup_event, event, data); @@ -1373,8 +1377,8 @@ process_sched_wakeup_event(void *data, struct perf_session *session, FILL_FIELD(wakeup_event, cpu, event, data); if (trace_handler->wakeup_event) - trace_handler->wakeup_event(&wakeup_event, session, event, - cpu, timestamp, thread); + trace_handler->wakeup_event(&wakeup_event, machine, event, + sample->cpu, sample->time, thread); } /* @@ -1392,7 +1396,7 @@ static char next_shortname2 = '0'; static void map_switch_event(struct trace_switch_event *switch_event, - struct perf_session *session, + struct machine *machine, struct event *event __used, int this_cpu, u64 timestamp, @@ -1420,8 +1424,8 @@ map_switch_event(struct trace_switch_event *switch_event, die("hm, delta: %" PRIu64 " < 0 ?\n", delta); - sched_out = perf_session__findnew(session, switch_event->prev_pid); - sched_in = perf_session__findnew(session, switch_event->next_pid); + sched_out = machine__findnew_thread(machine, switch_event->prev_pid); + sched_in = machine__findnew_thread(machine, switch_event->next_pid); curr_thread[this_cpu] = sched_in; @@ -1469,14 +1473,15 @@ map_switch_event(struct trace_switch_event *switch_event, } } - static void -process_sched_switch_event(void *data, struct perf_session *session, +process_sched_switch_event(struct perf_tool *tool __used, struct event *event, - int this_cpu, - u64 timestamp __used, - struct thread *thread __used) + struct perf_sample *sample, + struct machine *machine, + struct thread *thread) { + int this_cpu = sample->cpu; + void *data = sample->raw_data; struct trace_switch_event switch_event; FILL_COMMON_FIELDS(switch_event, event, data); @@ -1498,19 +1503,20 @@ process_sched_switch_event(void *data, struct perf_session *session, nr_context_switch_bugs++; } if (trace_handler->switch_event) - trace_handler->switch_event(&switch_event, session, event, - this_cpu, timestamp, thread); + trace_handler->switch_event(&switch_event, machine, event, + this_cpu, sample->time, thread); curr_pid[this_cpu] = switch_event.next_pid; } static void -process_sched_runtime_event(void *data, struct perf_session *session, - struct event *event, - int cpu __used, - u64 timestamp __used, - struct thread *thread __used) +process_sched_runtime_event(struct perf_tool *tool __used, + struct event *event, + struct perf_sample *sample, + struct machine *machine, + struct thread *thread) { + void *data = sample->raw_data; struct trace_runtime_event runtime_event; FILL_ARRAY(runtime_event, comm, event, data); @@ -1519,16 +1525,18 @@ process_sched_runtime_event(void *data, struct perf_session *session, FILL_FIELD(runtime_event, vruntime, event, data); if (trace_handler->runtime_event) - trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread); + trace_handler->runtime_event(&runtime_event, machine, event, + sample->cpu, sample->time, thread); } static void -process_sched_fork_event(void *data, +process_sched_fork_event(struct perf_tool *tool __used, struct event *event, - int cpu __used, - u64 timestamp __used, - struct thread *thread __used) + struct perf_sample *sample, + struct machine *machine __used, + struct thread *thread) { + void *data = sample->raw_data; struct trace_fork_event fork_event; FILL_COMMON_FIELDS(fork_event, event, data); @@ -1540,13 +1548,14 @@ process_sched_fork_event(void *data, if (trace_handler->fork_event) trace_handler->fork_event(&fork_event, event, - cpu, timestamp, thread); + sample->cpu, sample->time, thread); } static void -process_sched_exit_event(struct event *event, - int cpu __used, - u64 timestamp __used, +process_sched_exit_event(struct perf_tool *tool __used, + struct event *event, + struct perf_sample *sample __used, + struct machine *machine __used, struct thread *thread __used) { if (verbose) @@ -1554,12 +1563,13 @@ process_sched_exit_event(struct event *event, } static void -process_sched_migrate_task_event(void *data, struct perf_session *session, - struct event *event, - int cpu __used, - u64 timestamp __used, - struct thread *thread __used) +process_sched_migrate_task_event(struct perf_tool *tool __used, + struct event *event, + struct perf_sample *sample, + struct machine *machine, + struct thread *thread) { + void *data = sample->raw_data; struct trace_migrate_task_event migrate_task_event; FILL_COMMON_FIELDS(migrate_task_event, event, data); @@ -1570,67 +1580,47 @@ process_sched_migrate_task_event(void *data, struct perf_session *session, FILL_FIELD(migrate_task_event, cpu, event, data); if (trace_handler->migrate_task_event) - trace_handler->migrate_task_event(&migrate_task_event, session, - event, cpu, timestamp, thread); + trace_handler->migrate_task_event(&migrate_task_event, machine, + event, sample->cpu, + sample->time, thread); } -static void process_raw_event(union perf_event *raw_event __used, - struct perf_session *session, void *data, int cpu, - u64 timestamp, struct thread *thread) -{ - struct event *event; - int type; - - - type = trace_parse_common_type(data); - event = trace_find_event(type); - - if (!strcmp(event->name, "sched_switch")) - process_sched_switch_event(data, session, event, cpu, timestamp, thread); - if (!strcmp(event->name, "sched_stat_runtime")) - process_sched_runtime_event(data, session, event, cpu, timestamp, thread); - if (!strcmp(event->name, "sched_wakeup")) - process_sched_wakeup_event(data, session, event, cpu, timestamp, thread); - if (!strcmp(event->name, "sched_wakeup_new")) - process_sched_wakeup_event(data, session, event, cpu, timestamp, thread); - if (!strcmp(event->name, "sched_process_fork")) - process_sched_fork_event(data, event, cpu, timestamp, thread); - if (!strcmp(event->name, "sched_process_exit")) - process_sched_exit_event(event, cpu, timestamp, thread); - if (!strcmp(event->name, "sched_migrate_task")) - process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread); -} +typedef void (*tracepoint_handler)(struct perf_tool *tool, struct event *event, + struct perf_sample *sample, + struct machine *machine, + struct thread *thread); -static int process_sample_event(union perf_event *event, - struct perf_sample *sample, - struct perf_evsel *evsel __used, - struct perf_session *session) +static int perf_sched__process_tracepoint_sample(struct perf_tool *tool, + union perf_event *event __used, + struct perf_sample *sample, + struct perf_evsel *evsel, + struct machine *machine) { - struct thread *thread; - - if (!(session->sample_type & PERF_SAMPLE_RAW)) - return 0; + struct thread *thread = machine__findnew_thread(machine, sample->pid); - thread = perf_session__findnew(session, sample->pid); if (thread == NULL) { - pr_debug("problem processing %d event, skipping it.\n", - event->header.type); + pr_debug("problem processing %s event, skipping it.\n", + evsel->name); return -1; } - dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); + evsel->hists.stats.total_period += sample->period; + hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); - if (profile_cpu != -1 && profile_cpu != (int)sample->cpu) - return 0; + if (evsel->handler.func != NULL) { + tracepoint_handler f = evsel->handler.func; - process_raw_event(event, session, sample->raw_data, sample->cpu, - sample->time, thread); + if (evsel->handler.data == NULL) + evsel->handler.data = trace_find_event(evsel->attr.config); + + f(tool, evsel->handler.data, sample, machine, thread); + } return 0; } -static struct perf_event_ops event_ops = { - .sample = process_sample_event, +static struct perf_tool perf_sched = { + .sample = perf_sched__process_tracepoint_sample, .comm = perf_event__process_comm, .lost = perf_event__process_lost, .fork = perf_event__process_task, @@ -1640,13 +1630,25 @@ static struct perf_event_ops event_ops = { static void read_events(bool destroy, struct perf_session **psession) { int err = -EINVAL; + const struct perf_evsel_str_handler handlers[] = { + { "sched:sched_switch", process_sched_switch_event, }, + { "sched:sched_stat_runtime", process_sched_runtime_event, }, + { "sched:sched_wakeup", process_sched_wakeup_event, }, + { "sched:sched_wakeup_new", process_sched_wakeup_event, }, + { "sched:sched_process_fork", process_sched_fork_event, }, + { "sched:sched_process_exit", process_sched_exit_event, }, + { "sched:sched_migrate_task", process_sched_migrate_task_event, }, + }; struct perf_session *session = perf_session__new(input_name, O_RDONLY, - 0, false, &event_ops); + 0, false, &perf_sched); if (session == NULL) die("No Memory"); + err = perf_evlist__set_tracepoints_handlers_array(session->evlist, handlers); + assert(err == 0); + if (perf_session__has_traces(session, "record -R")) { - err = perf_session__process_events(session, &event_ops); + err = perf_session__process_events(session, &perf_sched); if (err) die("Failed to process events, error %d", err); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 2f62a295226..fd1909afcfd 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -7,6 +7,7 @@ #include "util/header.h" #include "util/parse-options.h" #include "util/session.h" +#include "util/tool.h" #include "util/symbol.h" #include "util/thread.h" #include "util/trace-event.h" @@ -23,6 +24,7 @@ static u64 nr_unordered; extern const struct option record_options[]; static bool no_callchain; static bool show_full_info; +static bool system_wide; static const char *cpu_list; static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS); @@ -315,7 +317,7 @@ static bool sample_addr_correlates_sym(struct perf_event_attr *attr) static void print_sample_addr(union perf_event *event, struct perf_sample *sample, - struct perf_session *session, + struct machine *machine, struct thread *thread, struct perf_event_attr *attr) { @@ -328,11 +330,11 @@ static void print_sample_addr(union perf_event *event, if (!sample_addr_correlates_sym(attr)) return; - thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, - event->ip.pid, sample->addr, &al); + thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION, + sample->addr, &al); if (!al.map) - thread__find_addr_map(thread, session, cpumode, MAP__VARIABLE, - event->ip.pid, sample->addr, &al); + thread__find_addr_map(thread, machine, cpumode, MAP__VARIABLE, + sample->addr, &al); al.cpu = sample->cpu; al.sym = NULL; @@ -362,7 +364,7 @@ static void print_sample_addr(union perf_event *event, static void process_event(union perf_event *event __unused, struct perf_sample *sample, struct perf_evsel *evsel, - struct perf_session *session, + struct machine *machine, struct thread *thread) { struct perf_event_attr *attr = &evsel->attr; @@ -377,15 +379,15 @@ static void process_event(union perf_event *event __unused, sample->raw_size); if (PRINT_FIELD(ADDR)) - print_sample_addr(event, sample, session, thread, attr); + print_sample_addr(event, sample, machine, thread, attr); if (PRINT_FIELD(IP)) { if (!symbol_conf.use_callchain) printf(" "); else printf("\n"); - perf_session__print_ip(event, sample, session, - PRINT_FIELD(SYM), PRINT_FIELD(DSO)); + perf_event__print_ip(event, sample, machine, evsel, + PRINT_FIELD(SYM), PRINT_FIELD(DSO)); } printf("\n"); @@ -432,14 +434,16 @@ static int cleanup_scripting(void) return scripting_ops->stop_script(); } -static char const *input_name = "perf.data"; +static const char *input_name; -static int process_sample_event(union perf_event *event, +static int process_sample_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, - struct perf_session *session) + struct machine *machine) { - struct thread *thread = perf_session__findnew(session, event->ip.pid); + struct addr_location al; + struct thread *thread = machine__findnew_thread(machine, event->ip.tid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", @@ -458,16 +462,25 @@ static int process_sample_event(union perf_event *event, return 0; } + if (perf_event__preprocess_sample(event, machine, &al, sample, 0) < 0) { + pr_err("problem processing %d event, skipping it.\n", + event->header.type); + return -1; + } + + if (al.filtered) + return 0; + if (cpu_list && !test_bit(sample->cpu, cpu_bitmap)) return 0; - scripting_ops->process_event(event, sample, evsel, session, thread); + scripting_ops->process_event(event, sample, evsel, machine, thread); - session->hists.stats.total_period += sample->period; + evsel->hists.stats.total_period += sample->period; return 0; } -static struct perf_event_ops event_ops = { +static struct perf_tool perf_script = { .sample = process_sample_event, .mmap = perf_event__process_mmap, .comm = perf_event__process_comm, @@ -494,7 +507,7 @@ static int __cmd_script(struct perf_session *session) signal(SIGINT, sig_handler); - ret = perf_session__process_events(session, &event_ops); + ret = perf_session__process_events(session, &perf_script); if (debug_mode) pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); @@ -523,12 +536,6 @@ static struct script_spec *script_spec__new(const char *spec, return s; } -static void script_spec__delete(struct script_spec *s) -{ - free(s->spec); - free(s); -} - static void script_spec__add(struct script_spec *s) { list_add_tail(&s->node, &script_specs); @@ -554,16 +561,11 @@ static struct script_spec *script_spec__findnew(const char *spec, s = script_spec__new(spec, ops); if (!s) - goto out_delete_spec; + return NULL; script_spec__add(s); return s; - -out_delete_spec: - script_spec__delete(s); - - return NULL; } int script_spec_register(const char *spec, struct scripting_ops *ops) @@ -681,7 +683,8 @@ static int parse_output_fields(const struct option *opt __used, type = PERF_TYPE_RAW; else { fprintf(stderr, "Invalid event type in field string.\n"); - return -EINVAL; + rc = -EINVAL; + goto out; } if (output[type].user_set) @@ -923,6 +926,24 @@ static int read_script_info(struct script_desc *desc, const char *filename) return 0; } +static char *get_script_root(struct dirent *script_dirent, const char *suffix) +{ + char *script_root, *str; + + script_root = strdup(script_dirent->d_name); + if (!script_root) + return NULL; + + str = (char *)ends_with(script_root, suffix); + if (!str) { + free(script_root); + return NULL; + } + + *str = '\0'; + return script_root; +} + static int list_available_scripts(const struct option *opt __used, const char *s __used, int unset __used) { @@ -934,7 +955,6 @@ static int list_available_scripts(const struct option *opt __used, struct script_desc *desc; char first_half[BUFSIZ]; char *script_root; - char *str; snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); @@ -950,16 +970,14 @@ static int list_available_scripts(const struct option *opt __used, continue; for_each_script(lang_path, lang_dir, script_dirent, script_next) { - script_root = strdup(script_dirent.d_name); - str = (char *)ends_with(script_root, REPORT_SUFFIX); - if (str) { - *str = '\0'; + script_root = get_script_root(&script_dirent, REPORT_SUFFIX); + if (script_root) { desc = script_desc__findnew(script_root); snprintf(script_path, MAXPATHLEN, "%s/%s", lang_path, script_dirent.d_name); read_script_info(desc, script_path); + free(script_root); } - free(script_root); } } @@ -981,8 +999,7 @@ static char *get_script_path(const char *script_root, const char *suffix) char script_path[MAXPATHLEN]; DIR *scripts_dir, *lang_dir; char lang_path[MAXPATHLEN]; - char *str, *__script_root; - char *path = NULL; + char *__script_root; snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path()); @@ -998,23 +1015,18 @@ static char *get_script_path(const char *script_root, const char *suffix) continue; for_each_script(lang_path, lang_dir, script_dirent, script_next) { - __script_root = strdup(script_dirent.d_name); - str = (char *)ends_with(__script_root, suffix); - if (str) { - *str = '\0'; - if (strcmp(__script_root, script_root)) - continue; + __script_root = get_script_root(&script_dirent, suffix); + if (__script_root && !strcmp(script_root, __script_root)) { + free(__script_root); snprintf(script_path, MAXPATHLEN, "%s/%s", lang_path, script_dirent.d_name); - path = strdup(script_path); - free(__script_root); - break; + return strdup(script_path); } free(__script_root); } } - return path; + return NULL; } static bool is_top_script(const char *script_path) @@ -1083,7 +1095,11 @@ static const struct option options[] = { OPT_CALLBACK('f', "fields", NULL, "str", "comma separated output fields prepend with 'type:'. Valid types: hw,sw,trace,raw. Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,addr", parse_output_fields), - OPT_STRING('c', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_BOOLEAN('a', "all-cpus", &system_wide, + "system-wide collection from all CPUs"), + OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"), + OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]", + "only display events for these comms"), OPT_BOOLEAN('I', "show-info", &show_full_info, "display extended information from perf.data file"), OPT_END() @@ -1110,7 +1126,6 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) struct perf_session *session; char *script_path = NULL; const char **__argv; - bool system_wide; int i, j, err; setup_scripting(); @@ -1178,15 +1193,17 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) } if (!pid) { - system_wide = true; j = 0; dup2(live_pipe[1], 1); close(live_pipe[0]); - if (!is_top_script(argv[0])) + if (is_top_script(argv[0])) { + system_wide = true; + } else if (!system_wide) { system_wide = !have_cmd(argc - rep_args, &argv[rep_args]); + } __argv = malloc((argc + 6) * sizeof(const char *)); if (!__argv) @@ -1234,10 +1251,11 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) script_path = rep_script_path; if (script_path) { - system_wide = false; j = 0; - if (rec_script_path) + if (!rec_script_path) + system_wide = false; + else if (!system_wide) system_wide = !have_cmd(argc - 1, &argv[1]); __argv = malloc((argc + 2) * sizeof(const char *)); @@ -1261,7 +1279,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) if (!script_name) setup_pager(); - session = perf_session__new(input_name, O_RDONLY, 0, false, &event_ops); + session = perf_session__new(input_name, O_RDONLY, 0, false, &perf_script); if (session == NULL) return -ENOMEM; @@ -1287,7 +1305,7 @@ int cmd_script(int argc, const char **argv, const char *prefix __used) return -1; } - input = open(input_name, O_RDONLY); + input = open(session->filename, O_RDONLY); /* input_name */ if (input < 0) { perror("failed to open file"); exit(-1); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 955930e0a5c..f5d2a63eba6 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -578,6 +578,33 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) avg / avg_stats(&walltime_nsecs_stats)); } +/* used for get_ratio_color() */ +enum grc_type { + GRC_STALLED_CYCLES_FE, + GRC_STALLED_CYCLES_BE, + GRC_CACHE_MISSES, + GRC_MAX_NR +}; + +static const char *get_ratio_color(enum grc_type type, double ratio) +{ + static const double grc_table[GRC_MAX_NR][3] = { + [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 }, + [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 }, + [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 }, + }; + const char *color = PERF_COLOR_NORMAL; + + if (ratio > grc_table[type][0]) + color = PERF_COLOR_RED; + else if (ratio > grc_table[type][1]) + color = PERF_COLOR_MAGENTA; + else if (ratio > grc_table[type][2]) + color = PERF_COLOR_YELLOW; + + return color; +} + static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg) { double total, ratio = 0.0; @@ -588,13 +615,7 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 50.0) - color = PERF_COLOR_RED; - else if (ratio > 30.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 10.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -611,13 +632,7 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 75.0) - color = PERF_COLOR_RED; - else if (ratio > 50.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 20.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -634,13 +649,7 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 20.0) - color = PERF_COLOR_RED; - else if (ratio > 10.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_CACHE_MISSES, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -657,13 +666,7 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 20.0) - color = PERF_COLOR_RED; - else if (ratio > 10.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_CACHE_MISSES, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -680,13 +683,7 @@ static void print_l1_icache_misses(int cpu, struct perf_evsel *evsel __used, dou if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 20.0) - color = PERF_COLOR_RED; - else if (ratio > 10.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_CACHE_MISSES, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -703,13 +700,7 @@ static void print_dtlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 20.0) - color = PERF_COLOR_RED; - else if (ratio > 10.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_CACHE_MISSES, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -726,13 +717,7 @@ static void print_itlb_cache_misses(int cpu, struct perf_evsel *evsel __used, do if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 20.0) - color = PERF_COLOR_RED; - else if (ratio > 10.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_CACHE_MISSES, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -749,13 +734,7 @@ static void print_ll_cache_misses(int cpu, struct perf_evsel *evsel __used, doub if (total) ratio = avg / total * 100.0; - color = PERF_COLOR_NORMAL; - if (ratio > 20.0) - color = PERF_COLOR_RED; - else if (ratio > 10.0) - color = PERF_COLOR_MAGENTA; - else if (ratio > 5.0) - color = PERF_COLOR_YELLOW; + color = get_ratio_color(GRC_CACHE_MISSES, ratio); fprintf(output, " # "); color_fprintf(output, color, "%6.2f%%", ratio); @@ -1108,22 +1087,13 @@ static const struct option options[] = { */ static int add_default_attributes(void) { - struct perf_evsel *pos; - size_t attr_nr = 0; - size_t c; - /* Set attrs if no event is selected and !null_run: */ if (null_run) return 0; if (!evsel_list->nr_entries) { - for (c = 0; c < ARRAY_SIZE(default_attrs); c++) { - pos = perf_evsel__new(default_attrs + c, c + attr_nr); - if (pos == NULL) - return -1; - perf_evlist__add(evsel_list, pos); - } - attr_nr += c; + if (perf_evlist__add_attrs_array(evsel_list, default_attrs) < 0) + return -1; } /* Detailed events get appended to the event list: */ @@ -1132,38 +1102,21 @@ static int add_default_attributes(void) return 0; /* Append detailed run extra attributes: */ - for (c = 0; c < ARRAY_SIZE(detailed_attrs); c++) { - pos = perf_evsel__new(detailed_attrs + c, c + attr_nr); - if (pos == NULL) - return -1; - perf_evlist__add(evsel_list, pos); - } - attr_nr += c; + if (perf_evlist__add_attrs_array(evsel_list, detailed_attrs) < 0) + return -1; if (detailed_run < 2) return 0; /* Append very detailed run extra attributes: */ - for (c = 0; c < ARRAY_SIZE(very_detailed_attrs); c++) { - pos = perf_evsel__new(very_detailed_attrs + c, c + attr_nr); - if (pos == NULL) - return -1; - perf_evlist__add(evsel_list, pos); - } + if (perf_evlist__add_attrs_array(evsel_list, very_detailed_attrs) < 0) + return -1; if (detailed_run < 3) return 0; /* Append very, very detailed run extra attributes: */ - for (c = 0; c < ARRAY_SIZE(very_very_detailed_attrs); c++) { - pos = perf_evsel__new(very_very_detailed_attrs + c, c + attr_nr); - if (pos == NULL) - return -1; - perf_evlist__add(evsel_list, pos); - } - - - return 0; + return perf_evlist__add_attrs_array(evsel_list, very_very_detailed_attrs); } int cmd_stat(int argc, const char **argv, const char *prefix __used) @@ -1267,8 +1220,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) list_for_each_entry(pos, &evsel_list->entries, node) { if (perf_evsel__alloc_stat_priv(pos) < 0 || - perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0 || - perf_evsel__alloc_fd(pos, evsel_list->cpus->nr, evsel_list->threads->nr) < 0) + perf_evsel__alloc_counts(pos, evsel_list->cpus->nr) < 0) goto out_free_fd; } diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index 831d1baeac3..2b9a7f497a2 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -7,6 +7,7 @@ #include "util/cache.h" #include "util/debug.h" +#include "util/debugfs.h" #include "util/evlist.h" #include "util/parse-options.h" #include "util/parse-events.h" @@ -14,8 +15,6 @@ #include "util/thread_map.h" #include "../../include/linux/hw_breakpoint.h" -static long page_size; - static int vmlinux_matches_kallsyms_filter(struct map *map __used, struct symbol *sym) { bool *visited = symbol__priv(sym); @@ -31,6 +30,7 @@ static int test__vmlinux_matches_kallsyms(void) struct map *kallsyms_map, *vmlinux_map; struct machine kallsyms, vmlinux; enum map_type type = MAP__FUNCTION; + long page_size = sysconf(_SC_PAGE_SIZE); struct ref_reloc_sym ref_reloc_sym = { .name = "_stext", }; /* @@ -247,7 +247,7 @@ static int trace_event__id(const char *evname) if (asprintf(&filename, "%s/syscalls/%s/id", - debugfs_path, evname) < 0) + tracing_events_path, evname) < 0) return -1; fd = open(filename, O_RDONLY); @@ -603,7 +603,7 @@ out_free_threads: #define TEST_ASSERT_VAL(text, cond) \ do { \ - if (!cond) { \ + if (!(cond)) { \ pr_debug("FAILED %s:%d %s\n", __FILE__, __LINE__, text); \ return -1; \ } \ @@ -759,6 +759,103 @@ static int test__checkevent_breakpoint_w(struct perf_evlist *evlist) return 0; } +static int test__checkevent_tracepoint_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_tracepoint(evlist); +} + +static int +test__checkevent_tracepoint_multi_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + + TEST_ASSERT_VAL("wrong number of entries", evlist->nr_entries > 1); + + list_for_each_entry(evsel, &evlist->entries, node) { + TEST_ASSERT_VAL("wrong exclude_user", + !evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", + evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + } + + return test__checkevent_tracepoint_multi(evlist); +} + +static int test__checkevent_raw_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + + return test__checkevent_raw(evlist); +} + +static int test__checkevent_numeric_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + + return test__checkevent_numeric(evlist); +} + +static int test__checkevent_symbolic_name_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", !evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_symbolic_name(evlist); +} + +static int test__checkevent_symbolic_alias_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", !evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", !evsel->attr.precise_ip); + + return test__checkevent_symbolic_alias(evlist); +} + +static int test__checkevent_genhw_modifier(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel = list_entry(evlist->entries.next, + struct perf_evsel, node); + + TEST_ASSERT_VAL("wrong exclude_user", evsel->attr.exclude_user); + TEST_ASSERT_VAL("wrong exclude_kernel", !evsel->attr.exclude_kernel); + TEST_ASSERT_VAL("wrong exclude_hv", evsel->attr.exclude_hv); + TEST_ASSERT_VAL("wrong precise_ip", evsel->attr.precise_ip); + + return test__checkevent_genhw(evlist); +} + static struct test__event_st { const char *name; __u32 type; @@ -808,6 +905,34 @@ static struct test__event_st { .name = "mem:0:w", .check = test__checkevent_breakpoint_w, }, + { + .name = "syscalls:sys_enter_open:k", + .check = test__checkevent_tracepoint_modifier, + }, + { + .name = "syscalls:*:u", + .check = test__checkevent_tracepoint_multi_modifier, + }, + { + .name = "r1:kp", + .check = test__checkevent_raw_modifier, + }, + { + .name = "1:1:hp", + .check = test__checkevent_numeric_modifier, + }, + { + .name = "instructions:h", + .check = test__checkevent_symbolic_name_modifier, + }, + { + .name = "faults:u", + .check = test__checkevent_symbolic_alias_modifier, + }, + { + .name = "L1-dcache-load-miss:kp", + .check = test__checkevent_genhw_modifier, + }, }; #define TEST__EVENTS_CNT (sizeof(test__events) / sizeof(struct test__event_st)) @@ -841,6 +966,336 @@ static int test__parse_events(void) return ret; } + +static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t **maskp, + size_t *sizep) +{ + cpu_set_t *mask; + size_t size; + int i, cpu = -1, nrcpus = 1024; +realloc: + mask = CPU_ALLOC(nrcpus); + size = CPU_ALLOC_SIZE(nrcpus); + CPU_ZERO_S(size, mask); + + if (sched_getaffinity(pid, size, mask) == -1) { + CPU_FREE(mask); + if (errno == EINVAL && nrcpus < (1024 << 8)) { + nrcpus = nrcpus << 2; + goto realloc; + } + perror("sched_getaffinity"); + return -1; + } + + for (i = 0; i < nrcpus; i++) { + if (CPU_ISSET_S(i, size, mask)) { + if (cpu == -1) { + cpu = i; + *maskp = mask; + *sizep = size; + } else + CPU_CLR_S(i, size, mask); + } + } + + if (cpu == -1) + CPU_FREE(mask); + + return cpu; +} + +static int test__PERF_RECORD(void) +{ + struct perf_record_opts opts = { + .target_pid = -1, + .target_tid = -1, + .no_delay = true, + .freq = 10, + .mmap_pages = 256, + .sample_id_all_avail = true, + }; + cpu_set_t *cpu_mask = NULL; + size_t cpu_mask_size = 0; + struct perf_evlist *evlist = perf_evlist__new(NULL, NULL); + struct perf_evsel *evsel; + struct perf_sample sample; + const char *cmd = "sleep"; + const char *argv[] = { cmd, "1", NULL, }; + char *bname; + u64 sample_type, prev_time = 0; + bool found_cmd_mmap = false, + found_libc_mmap = false, + found_vdso_mmap = false, + found_ld_mmap = false; + int err = -1, errs = 0, i, wakeups = 0, sample_size; + u32 cpu; + int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, }; + + if (evlist == NULL || argv == NULL) { + pr_debug("Not enough memory to create evlist\n"); + goto out; + } + + /* + * We need at least one evsel in the evlist, use the default + * one: "cycles". + */ + err = perf_evlist__add_default(evlist); + if (err < 0) { + pr_debug("Not enough memory to create evsel\n"); + goto out_delete_evlist; + } + + /* + * Create maps of threads and cpus to monitor. In this case + * we start with all threads and cpus (-1, -1) but then in + * perf_evlist__prepare_workload we'll fill in the only thread + * we're monitoring, the one forked there. + */ + err = perf_evlist__create_maps(evlist, opts.target_pid, + opts.target_tid, opts.cpu_list); + if (err < 0) { + pr_debug("Not enough memory to create thread/cpu maps\n"); + goto out_delete_evlist; + } + + /* + * Prepare the workload in argv[] to run, it'll fork it, and then wait + * for perf_evlist__start_workload() to exec it. This is done this way + * so that we have time to open the evlist (calling sys_perf_event_open + * on all the fds) and then mmap them. + */ + err = perf_evlist__prepare_workload(evlist, &opts, argv); + if (err < 0) { + pr_debug("Couldn't run the workload!\n"); + goto out_delete_evlist; + } + + /* + * Config the evsels, setting attr->comm on the first one, etc. + */ + evsel = list_entry(evlist->entries.next, struct perf_evsel, node); + evsel->attr.sample_type |= PERF_SAMPLE_CPU; + evsel->attr.sample_type |= PERF_SAMPLE_TID; + evsel->attr.sample_type |= PERF_SAMPLE_TIME; + perf_evlist__config_attrs(evlist, &opts); + + err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask, + &cpu_mask_size); + if (err < 0) { + pr_debug("sched__get_first_possible_cpu: %s\n", strerror(errno)); + goto out_delete_evlist; + } + + cpu = err; + + /* + * So that we can check perf_sample.cpu on all the samples. + */ + if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, cpu_mask) < 0) { + pr_debug("sched_setaffinity: %s\n", strerror(errno)); + goto out_free_cpu_mask; + } + + /* + * Call sys_perf_event_open on all the fds on all the evsels, + * grouping them if asked to. + */ + err = perf_evlist__open(evlist, opts.group); + if (err < 0) { + pr_debug("perf_evlist__open: %s\n", strerror(errno)); + goto out_delete_evlist; + } + + /* + * mmap the first fd on a given CPU and ask for events for the other + * fds in the same CPU to be injected in the same mmap ring buffer + * (using ioctl(PERF_EVENT_IOC_SET_OUTPUT)). + */ + err = perf_evlist__mmap(evlist, opts.mmap_pages, false); + if (err < 0) { + pr_debug("perf_evlist__mmap: %s\n", strerror(errno)); + goto out_delete_evlist; + } + + /* + * We'll need these two to parse the PERF_SAMPLE_* fields in each + * event. + */ + sample_type = perf_evlist__sample_type(evlist); + sample_size = __perf_evsel__sample_size(sample_type); + + /* + * Now that all is properly set up, enable the events, they will + * count just on workload.pid, which will start... + */ + perf_evlist__enable(evlist); + + /* + * Now! + */ + perf_evlist__start_workload(evlist); + + while (1) { + int before = total_events; + + for (i = 0; i < evlist->nr_mmaps; i++) { + union perf_event *event; + + while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) { + const u32 type = event->header.type; + const char *name = perf_event__name(type); + + ++total_events; + if (type < PERF_RECORD_MAX) + nr_events[type]++; + + err = perf_event__parse_sample(event, sample_type, + sample_size, true, + &sample, false); + if (err < 0) { + if (verbose) + perf_event__fprintf(event, stderr); + pr_debug("Couldn't parse sample\n"); + goto out_err; + } + + if (verbose) { + pr_info("%" PRIu64" %d ", sample.time, sample.cpu); + perf_event__fprintf(event, stderr); + } + + if (prev_time > sample.time) { + pr_debug("%s going backwards in time, prev=%" PRIu64 ", curr=%" PRIu64 "\n", + name, prev_time, sample.time); + ++errs; + } + + prev_time = sample.time; + + if (sample.cpu != cpu) { + pr_debug("%s with unexpected cpu, expected %d, got %d\n", + name, cpu, sample.cpu); + ++errs; + } + + if ((pid_t)sample.pid != evlist->workload.pid) { + pr_debug("%s with unexpected pid, expected %d, got %d\n", + name, evlist->workload.pid, sample.pid); + ++errs; + } + + if ((pid_t)sample.tid != evlist->workload.pid) { + pr_debug("%s with unexpected tid, expected %d, got %d\n", + name, evlist->workload.pid, sample.tid); + ++errs; + } + + if ((type == PERF_RECORD_COMM || + type == PERF_RECORD_MMAP || + type == PERF_RECORD_FORK || + type == PERF_RECORD_EXIT) && + (pid_t)event->comm.pid != evlist->workload.pid) { + pr_debug("%s with unexpected pid/tid\n", name); + ++errs; + } + + if ((type == PERF_RECORD_COMM || + type == PERF_RECORD_MMAP) && + event->comm.pid != event->comm.tid) { + pr_debug("%s with different pid/tid!\n", name); + ++errs; + } + + switch (type) { + case PERF_RECORD_COMM: + if (strcmp(event->comm.comm, cmd)) { + pr_debug("%s with unexpected comm!\n", name); + ++errs; + } + break; + case PERF_RECORD_EXIT: + goto found_exit; + case PERF_RECORD_MMAP: + bname = strrchr(event->mmap.filename, '/'); + if (bname != NULL) { + if (!found_cmd_mmap) + found_cmd_mmap = !strcmp(bname + 1, cmd); + if (!found_libc_mmap) + found_libc_mmap = !strncmp(bname + 1, "libc", 4); + if (!found_ld_mmap) + found_ld_mmap = !strncmp(bname + 1, "ld", 2); + } else if (!found_vdso_mmap) + found_vdso_mmap = !strcmp(event->mmap.filename, "[vdso]"); + break; + + case PERF_RECORD_SAMPLE: + /* Just ignore samples for now */ + break; + default: + pr_debug("Unexpected perf_event->header.type %d!\n", + type); + ++errs; + } + } + } + + /* + * We don't use poll here because at least at 3.1 times the + * PERF_RECORD_{!SAMPLE} events don't honour + * perf_event_attr.wakeup_events, just PERF_EVENT_SAMPLE does. + */ + if (total_events == before && false) + poll(evlist->pollfd, evlist->nr_fds, -1); + + sleep(1); + if (++wakeups > 5) { + pr_debug("No PERF_RECORD_EXIT event!\n"); + break; + } + } + +found_exit: + if (nr_events[PERF_RECORD_COMM] > 1) { + pr_debug("Excessive number of PERF_RECORD_COMM events!\n"); + ++errs; + } + + if (nr_events[PERF_RECORD_COMM] == 0) { + pr_debug("Missing PERF_RECORD_COMM for %s!\n", cmd); + ++errs; + } + + if (!found_cmd_mmap) { + pr_debug("PERF_RECORD_MMAP for %s missing!\n", cmd); + ++errs; + } + + if (!found_libc_mmap) { + pr_debug("PERF_RECORD_MMAP for %s missing!\n", "libc"); + ++errs; + } + + if (!found_ld_mmap) { + pr_debug("PERF_RECORD_MMAP for %s missing!\n", "ld"); + ++errs; + } + + if (!found_vdso_mmap) { + pr_debug("PERF_RECORD_MMAP for %s missing!\n", "[vdso]"); + ++errs; + } +out_err: + perf_evlist__munmap(evlist); +out_free_cpu_mask: + CPU_FREE(cpu_mask); +out_delete_evlist: + perf_evlist__delete(evlist); +out: + return (err < 0 || errs > 0) ? -1 : 0; +} + static struct test { const char *desc; int (*func)(void); @@ -866,45 +1321,89 @@ static struct test { .func = test__parse_events, }, { + .desc = "Validate PERF_RECORD_* events & perf_sample fields", + .func = test__PERF_RECORD, + }, + { .func = NULL, }, }; -static int __cmd_test(void) +static bool perf_test__matches(int curr, int argc, const char *argv[]) { - int i = 0; + int i; + + if (argc == 0) + return true; - page_size = sysconf(_SC_PAGE_SIZE); + for (i = 0; i < argc; ++i) { + char *end; + long nr = strtoul(argv[i], &end, 10); + + if (*end == '\0') { + if (nr == curr + 1) + return true; + continue; + } + + if (strstr(tests[curr].desc, argv[i])) + return true; + } + + return false; +} + +static int __cmd_test(int argc, const char *argv[]) +{ + int i = 0; while (tests[i].func) { - int err; - pr_info("%2d: %s:", i + 1, tests[i].desc); + int curr = i++, err; + + if (!perf_test__matches(curr, argc, argv)) + continue; + + pr_info("%2d: %s:", i, tests[curr].desc); pr_debug("\n--- start ---\n"); - err = tests[i].func(); - pr_debug("---- end ----\n%s:", tests[i].desc); + err = tests[curr].func(); + pr_debug("---- end ----\n%s:", tests[curr].desc); pr_info(" %s\n", err ? "FAILED!\n" : "Ok"); - ++i; } return 0; } -static const char * const test_usage[] = { - "perf test [<options>]", - NULL, -}; +static int perf_test__list(int argc, const char **argv) +{ + int i = 0; + + while (tests[i].func) { + int curr = i++; -static const struct option test_options[] = { + if (argc > 1 && !strstr(tests[curr].desc, argv[1])) + continue; + + pr_info("%2d: %s\n", i, tests[curr].desc); + } + + return 0; +} + +int cmd_test(int argc, const char **argv, const char *prefix __used) +{ + const char * const test_usage[] = { + "perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]", + NULL, + }; + const struct option test_options[] = { OPT_INTEGER('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"), OPT_END() -}; + }; -int cmd_test(int argc, const char **argv, const char *prefix __used) -{ argc = parse_options(argc, argv, test_options, test_usage, 0); - if (argc) - usage_with_options(test_usage, test_options); + if (argc >= 1 && !strcmp(argv[0], "list")) + return perf_test__list(argc, argv); symbol_conf.priv_size = sizeof(int); symbol_conf.sort_by_name = true; @@ -915,5 +1414,5 @@ int cmd_test(int argc, const char **argv, const char *prefix __used) setup_pager(); - return __cmd_test(); + return __cmd_test(argc, argv); } diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index aa26f4d66d1..3b75b2e21ea 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -19,6 +19,7 @@ #include "util/color.h" #include <linux/list.h> #include "util/cache.h" +#include "util/evsel.h" #include <linux/rbtree.h> #include "util/symbol.h" #include "util/callchain.h" @@ -31,13 +32,14 @@ #include "util/event.h" #include "util/session.h" #include "util/svghelper.h" +#include "util/tool.h" #define SUPPORT_OLD_POWER_EVENTS 1 #define PWR_EVENT_EXIT -1 -static char const *input_name = "perf.data"; -static char const *output_name = "output.svg"; +static const char *input_name; +static const char *output_name = "output.svg"; static unsigned int numcpus; static u64 min_freq; /* Lowest CPU frequency seen */ @@ -273,25 +275,28 @@ static int cpus_cstate_state[MAX_CPUS]; static u64 cpus_pstate_start_times[MAX_CPUS]; static u64 cpus_pstate_state[MAX_CPUS]; -static int process_comm_event(union perf_event *event, +static int process_comm_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session __used) + struct machine *machine __used) { pid_set_comm(event->comm.tid, event->comm.comm); return 0; } -static int process_fork_event(union perf_event *event, +static int process_fork_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session __used) + struct machine *machine __used) { pid_fork(event->fork.pid, event->fork.ppid, event->fork.time); return 0; } -static int process_exit_event(union perf_event *event, +static int process_exit_event(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session __used) + struct machine *machine __used) { pid_exit(event->fork.pid, event->fork.time); return 0; @@ -486,14 +491,15 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te) } -static int process_sample_event(union perf_event *event __used, +static int process_sample_event(struct perf_tool *tool __used, + union perf_event *event __used, struct perf_sample *sample, - struct perf_evsel *evsel __used, - struct perf_session *session) + struct perf_evsel *evsel, + struct machine *machine __used) { struct trace_entry *te; - if (session->sample_type & PERF_SAMPLE_TIME) { + if (evsel->attr.sample_type & PERF_SAMPLE_TIME) { if (!first_time || first_time > sample->time) first_time = sample->time; if (last_time < sample->time) @@ -501,7 +507,7 @@ static int process_sample_event(union perf_event *event __used, } te = (void *)sample->raw_data; - if (session->sample_type & PERF_SAMPLE_RAW && sample->raw_size > 0) { + if ((evsel->attr.sample_type & PERF_SAMPLE_RAW) && sample->raw_size > 0) { char *event_str; #ifdef SUPPORT_OLD_POWER_EVENTS struct power_entry_old *peo; @@ -974,7 +980,7 @@ static void write_svg_file(const char *filename) svg_close(); } -static struct perf_event_ops event_ops = { +static struct perf_tool perf_timechart = { .comm = process_comm_event, .fork = process_fork_event, .exit = process_exit_event, @@ -985,7 +991,7 @@ static struct perf_event_ops event_ops = { static int __cmd_timechart(void) { struct perf_session *session = perf_session__new(input_name, O_RDONLY, - 0, false, &event_ops); + 0, false, &perf_timechart); int ret = -EINVAL; if (session == NULL) @@ -994,7 +1000,7 @@ static int __cmd_timechart(void) if (!perf_session__has_traces(session, "timechart record")) goto out_delete; - ret = perf_session__process_events(session, &event_ops); + ret = perf_session__process_events(session, &perf_timechart); if (ret) goto out_delete; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c9cdedb5813..4f81eeb9987 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -64,44 +64,6 @@ #include <linux/unistd.h> #include <linux/types.h> -static struct perf_top top = { - .count_filter = 5, - .delay_secs = 2, - .target_pid = -1, - .target_tid = -1, - .freq = 1000, /* 1 KHz */ -}; - -static bool system_wide = false; - -static bool use_tui, use_stdio; - -static bool sort_has_symbols; - -static bool dont_use_callchains; -static char callchain_default_opt[] = "fractal,0.5,callee"; - - -static int default_interval = 0; - -static bool kptr_restrict_warned; -static bool vmlinux_warned; -static bool inherit = false; -static int realtime_prio = 0; -static bool group = false; -static bool sample_id_all_avail = true; -static unsigned int mmap_pages = 128; - -static bool dump_symtab = false; - -static struct winsize winsize; - -static const char *sym_filter = NULL; -static int sym_pcnt_filter = 5; - -/* - * Source functions - */ void get_term_dimensions(struct winsize *ws) { @@ -125,21 +87,23 @@ void get_term_dimensions(struct winsize *ws) ws->ws_col = 80; } -static void update_print_entries(struct winsize *ws) +static void perf_top__update_print_entries(struct perf_top *top) { - top.print_entries = ws->ws_row; + top->print_entries = top->winsize.ws_row; - if (top.print_entries > 9) - top.print_entries -= 9; + if (top->print_entries > 9) + top->print_entries -= 9; } -static void sig_winch_handler(int sig __used) +static void perf_top__sig_winch(int sig __used, siginfo_t *info __used, void *arg) { - get_term_dimensions(&winsize); - update_print_entries(&winsize); + struct perf_top *top = arg; + + get_term_dimensions(&top->winsize); + perf_top__update_print_entries(top); } -static int parse_source(struct hist_entry *he) +static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) { struct symbol *sym; struct annotation *notes; @@ -170,7 +134,7 @@ static int parse_source(struct hist_entry *he) pthread_mutex_lock(¬es->lock); - if (symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { + if (symbol__alloc_hist(sym) < 0) { pthread_mutex_unlock(¬es->lock); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); @@ -181,7 +145,7 @@ static int parse_source(struct hist_entry *he) err = symbol__annotate(sym, map, 0); if (err == 0) { out_assign: - top.sym_filter_entry = he; + top->sym_filter_entry = he; } pthread_mutex_unlock(¬es->lock); @@ -194,14 +158,16 @@ static void __zero_source_counters(struct hist_entry *he) symbol__annotate_zero_histograms(sym); } -static void record_precise_ip(struct hist_entry *he, int counter, u64 ip) +static void perf_top__record_precise_ip(struct perf_top *top, + struct hist_entry *he, + int counter, u64 ip) { struct annotation *notes; struct symbol *sym; if (he == NULL || he->ms.sym == NULL || - ((top.sym_filter_entry == NULL || - top.sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1)) + ((top->sym_filter_entry == NULL || + top->sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1)) return; sym = he->ms.sym; @@ -210,8 +176,7 @@ static void record_precise_ip(struct hist_entry *he, int counter, u64 ip) if (pthread_mutex_trylock(¬es->lock)) return; - if (notes->src == NULL && - symbol__alloc_hist(sym, top.evlist->nr_entries) < 0) { + if (notes->src == NULL && symbol__alloc_hist(sym) < 0) { pthread_mutex_unlock(¬es->lock); pr_err("Not enough memory for annotating '%s' symbol!\n", sym->name); @@ -225,8 +190,9 @@ static void record_precise_ip(struct hist_entry *he, int counter, u64 ip) pthread_mutex_unlock(¬es->lock); } -static void show_details(struct hist_entry *he) +static void perf_top__show_details(struct perf_top *top) { + struct hist_entry *he = top->sym_filter_entry; struct annotation *notes; struct symbol *symbol; int more; @@ -242,15 +208,15 @@ static void show_details(struct hist_entry *he) if (notes->src == NULL) goto out_unlock; - printf("Showing %s for %s\n", event_name(top.sym_evsel), symbol->name); - printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); + printf("Showing %s for %s\n", event_name(top->sym_evsel), symbol->name); + printf(" Events Pcnt (>=%d%%)\n", top->sym_pcnt_filter); - more = symbol__annotate_printf(symbol, he->ms.map, top.sym_evsel->idx, - 0, sym_pcnt_filter, top.print_entries, 4); - if (top.zero) - symbol__annotate_zero_histogram(symbol, top.sym_evsel->idx); + more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel->idx, + 0, top->sym_pcnt_filter, top->print_entries, 4); + if (top->zero) + symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx); else - symbol__annotate_decay_histogram(symbol, top.sym_evsel->idx); + symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx); if (more != 0) printf("%d lines not displayed, maybe increase display entries [e]\n", more); out_unlock: @@ -259,11 +225,9 @@ out_unlock: static const char CONSOLE_CLEAR[] = "[H[2J"; -static struct hist_entry * - perf_session__add_hist_entry(struct perf_session *session, - struct addr_location *al, - struct perf_sample *sample, - struct perf_evsel *evsel) +static struct hist_entry *perf_evsel__add_hist_entry(struct perf_evsel *evsel, + struct addr_location *al, + struct perf_sample *sample) { struct hist_entry *he; @@ -271,50 +235,51 @@ static struct hist_entry * if (he == NULL) return NULL; - session->hists.stats.total_period += sample->period; + evsel->hists.stats.total_period += sample->period; hists__inc_nr_events(&evsel->hists, PERF_RECORD_SAMPLE); return he; } -static void print_sym_table(void) +static void perf_top__print_sym_table(struct perf_top *top) { char bf[160]; int printed = 0; - const int win_width = winsize.ws_col - 1; + const int win_width = top->winsize.ws_col - 1; puts(CONSOLE_CLEAR); - perf_top__header_snprintf(&top, bf, sizeof(bf)); + perf_top__header_snprintf(top, bf, sizeof(bf)); printf("%s\n", bf); - perf_top__reset_sample_counters(&top); + perf_top__reset_sample_counters(top); printf("%-*.*s\n", win_width, win_width, graph_dotted_line); - if (top.sym_evsel->hists.stats.nr_lost_warned != - top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) { - top.sym_evsel->hists.stats.nr_lost_warned = - top.sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]; + if (top->sym_evsel->hists.stats.nr_lost_warned != + top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) { + top->sym_evsel->hists.stats.nr_lost_warned = + top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]; color_fprintf(stdout, PERF_COLOR_RED, "WARNING: LOST %d chunks, Check IO/CPU overload", - top.sym_evsel->hists.stats.nr_lost_warned); + top->sym_evsel->hists.stats.nr_lost_warned); ++printed; } - if (top.sym_filter_entry) { - show_details(top.sym_filter_entry); + if (top->sym_filter_entry) { + perf_top__show_details(top); return; } - hists__collapse_resort_threaded(&top.sym_evsel->hists); - hists__output_resort_threaded(&top.sym_evsel->hists); - hists__decay_entries_threaded(&top.sym_evsel->hists, - top.hide_user_symbols, - top.hide_kernel_symbols); - hists__output_recalc_col_len(&top.sym_evsel->hists, winsize.ws_row - 3); + hists__collapse_resort_threaded(&top->sym_evsel->hists); + hists__output_resort_threaded(&top->sym_evsel->hists); + hists__decay_entries_threaded(&top->sym_evsel->hists, + top->hide_user_symbols, + top->hide_kernel_symbols); + hists__output_recalc_col_len(&top->sym_evsel->hists, + top->winsize.ws_row - 3); putchar('\n'); - hists__fprintf(&top.sym_evsel->hists, NULL, false, false, - winsize.ws_row - 4 - printed, win_width, stdout); + hists__fprintf(&top->sym_evsel->hists, NULL, false, false, + top->winsize.ws_row - 4 - printed, win_width, stdout); } static void prompt_integer(int *target, const char *msg) @@ -352,17 +317,17 @@ static void prompt_percent(int *target, const char *msg) *target = tmp; } -static void prompt_symbol(struct hist_entry **target, const char *msg) +static void perf_top__prompt_symbol(struct perf_top *top, const char *msg) { char *buf = malloc(0), *p; - struct hist_entry *syme = *target, *n, *found = NULL; + struct hist_entry *syme = top->sym_filter_entry, *n, *found = NULL; struct rb_node *next; size_t dummy = 0; /* zero counters of active symbol */ if (syme) { __zero_source_counters(syme); - *target = NULL; + top->sym_filter_entry = NULL; } fprintf(stdout, "\n%s: ", msg); @@ -373,7 +338,7 @@ static void prompt_symbol(struct hist_entry **target, const char *msg) if (p) *p = 0; - next = rb_first(&top.sym_evsel->hists.entries); + next = rb_first(&top->sym_evsel->hists.entries); while (next) { n = rb_entry(next, struct hist_entry, rb_node); if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) { @@ -386,47 +351,46 @@ static void prompt_symbol(struct hist_entry **target, const char *msg) if (!found) { fprintf(stderr, "Sorry, %s is not active.\n", buf); sleep(1); - return; } else - parse_source(found); + perf_top__parse_source(top, found); out_free: free(buf); } -static void print_mapped_keys(void) +static void perf_top__print_mapped_keys(struct perf_top *top) { char *name = NULL; - if (top.sym_filter_entry) { - struct symbol *sym = top.sym_filter_entry->ms.sym; + if (top->sym_filter_entry) { + struct symbol *sym = top->sym_filter_entry->ms.sym; name = sym->name; } fprintf(stdout, "\nMapped keys:\n"); - fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top.delay_secs); - fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top.print_entries); + fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top->delay_secs); + fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top->print_entries); - if (top.evlist->nr_entries > 1) - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top.sym_evsel)); + if (top->evlist->nr_entries > 1) + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(top->sym_evsel)); - fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top.count_filter); + fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter); - fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); + fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", top->sym_pcnt_filter); fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); fprintf(stdout, "\t[S] stop annotation.\n"); fprintf(stdout, "\t[K] hide kernel_symbols symbols. \t(%s)\n", - top.hide_kernel_symbols ? "yes" : "no"); + top->hide_kernel_symbols ? "yes" : "no"); fprintf(stdout, "\t[U] hide user symbols. \t(%s)\n", - top.hide_user_symbols ? "yes" : "no"); - fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top.zero ? 1 : 0); + top->hide_user_symbols ? "yes" : "no"); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top->zero ? 1 : 0); fprintf(stdout, "\t[qQ] quit.\n"); } -static int key_mapped(int c) +static int perf_top__key_mapped(struct perf_top *top, int c) { switch (c) { case 'd': @@ -442,7 +406,7 @@ static int key_mapped(int c) case 'S': return 1; case 'E': - return top.evlist->nr_entries > 1 ? 1 : 0; + return top->evlist->nr_entries > 1 ? 1 : 0; default: break; } @@ -450,13 +414,13 @@ static int key_mapped(int c) return 0; } -static void handle_keypress(int c) +static void perf_top__handle_keypress(struct perf_top *top, int c) { - if (!key_mapped(c)) { + if (!perf_top__key_mapped(top, c)) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; struct termios tc, save; - print_mapped_keys(); + perf_top__print_mapped_keys(top); fprintf(stdout, "\nEnter selection, or unmapped key to continue: "); fflush(stdout); @@ -471,81 +435,86 @@ static void handle_keypress(int c) c = getc(stdin); tcsetattr(0, TCSAFLUSH, &save); - if (!key_mapped(c)) + if (!perf_top__key_mapped(top, c)) return; } switch (c) { case 'd': - prompt_integer(&top.delay_secs, "Enter display delay"); - if (top.delay_secs < 1) - top.delay_secs = 1; + prompt_integer(&top->delay_secs, "Enter display delay"); + if (top->delay_secs < 1) + top->delay_secs = 1; break; case 'e': - prompt_integer(&top.print_entries, "Enter display entries (lines)"); - if (top.print_entries == 0) { - sig_winch_handler(SIGWINCH); - signal(SIGWINCH, sig_winch_handler); + prompt_integer(&top->print_entries, "Enter display entries (lines)"); + if (top->print_entries == 0) { + struct sigaction act = { + .sa_sigaction = perf_top__sig_winch, + .sa_flags = SA_SIGINFO, + }; + perf_top__sig_winch(SIGWINCH, NULL, top); + sigaction(SIGWINCH, &act, NULL); } else signal(SIGWINCH, SIG_DFL); break; case 'E': - if (top.evlist->nr_entries > 1) { + if (top->evlist->nr_entries > 1) { /* Select 0 as the default event: */ int counter = 0; fprintf(stderr, "\nAvailable events:"); - list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) - fprintf(stderr, "\n\t%d %s", top.sym_evsel->idx, event_name(top.sym_evsel)); + list_for_each_entry(top->sym_evsel, &top->evlist->entries, node) + fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, event_name(top->sym_evsel)); prompt_integer(&counter, "Enter details event counter"); - if (counter >= top.evlist->nr_entries) { - top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); - fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top.sym_evsel)); + if (counter >= top->evlist->nr_entries) { + top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node); + fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(top->sym_evsel)); sleep(1); break; } - list_for_each_entry(top.sym_evsel, &top.evlist->entries, node) - if (top.sym_evsel->idx == counter) + list_for_each_entry(top->sym_evsel, &top->evlist->entries, node) + if (top->sym_evsel->idx == counter) break; } else - top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); + top->sym_evsel = list_entry(top->evlist->entries.next, struct perf_evsel, node); break; case 'f': - prompt_integer(&top.count_filter, "Enter display event count filter"); + prompt_integer(&top->count_filter, "Enter display event count filter"); break; case 'F': - prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); + prompt_percent(&top->sym_pcnt_filter, + "Enter details display event filter (percent)"); break; case 'K': - top.hide_kernel_symbols = !top.hide_kernel_symbols; + top->hide_kernel_symbols = !top->hide_kernel_symbols; break; case 'q': case 'Q': printf("exiting.\n"); - if (dump_symtab) - perf_session__fprintf_dsos(top.session, stderr); + if (top->dump_symtab) + perf_session__fprintf_dsos(top->session, stderr); exit(0); case 's': - prompt_symbol(&top.sym_filter_entry, "Enter details symbol"); + perf_top__prompt_symbol(top, "Enter details symbol"); break; case 'S': - if (!top.sym_filter_entry) + if (!top->sym_filter_entry) break; else { - struct hist_entry *syme = top.sym_filter_entry; + struct hist_entry *syme = top->sym_filter_entry; - top.sym_filter_entry = NULL; + top->sym_filter_entry = NULL; __zero_source_counters(syme); } break; case 'U': - top.hide_user_symbols = !top.hide_user_symbols; + top->hide_user_symbols = !top->hide_user_symbols; break; case 'z': - top.zero = !top.zero; + top->zero = !top->zero; break; default: break; @@ -563,28 +532,30 @@ static void perf_top__sort_new_samples(void *arg) hists__collapse_resort_threaded(&t->sym_evsel->hists); hists__output_resort_threaded(&t->sym_evsel->hists); hists__decay_entries_threaded(&t->sym_evsel->hists, - top.hide_user_symbols, - top.hide_kernel_symbols); + t->hide_user_symbols, + t->hide_kernel_symbols); } -static void *display_thread_tui(void *arg __used) +static void *display_thread_tui(void *arg) { + struct perf_top *top = arg; const char *help = "For a higher level overview, try: perf top --sort comm,dso"; - perf_top__sort_new_samples(&top); - perf_evlist__tui_browse_hists(top.evlist, help, + perf_top__sort_new_samples(top); + perf_evlist__tui_browse_hists(top->evlist, help, perf_top__sort_new_samples, - &top, top.delay_secs); + top, top->delay_secs); exit_browser(0); exit(0); return NULL; } -static void *display_thread(void *arg __used) +static void *display_thread(void *arg) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; struct termios tc, save; + struct perf_top *top = arg; int delay_msecs, c; tcgetattr(0, &save); @@ -595,13 +566,13 @@ static void *display_thread(void *arg __used) pthread__unblock_sigwinch(); repeat: - delay_msecs = top.delay_secs * 1000; + delay_msecs = top->delay_secs * 1000; tcsetattr(0, TCSANOW, &tc); /* trash return*/ getc(stdin); while (1) { - print_sym_table(); + perf_top__print_sym_table(top); /* * Either timeout expired or we got an EINTR due to SIGWINCH, * refresh screen in both cases. @@ -621,7 +592,7 @@ process_hotkey: c = getc(stdin); tcsetattr(0, TCSAFLUSH, &save); - handle_keypress(c); + perf_top__handle_keypress(top, c); goto repeat; return NULL; @@ -673,47 +644,17 @@ static int symbol_filter(struct map *map __used, struct symbol *sym) return 0; } -static void perf_event__process_sample(const union perf_event *event, +static void perf_event__process_sample(struct perf_tool *tool, + const union perf_event *event, struct perf_evsel *evsel, struct perf_sample *sample, - struct perf_session *session) + struct machine *machine) { + struct perf_top *top = container_of(tool, struct perf_top, tool); struct symbol *parent = NULL; u64 ip = event->ip.ip; struct addr_location al; - struct machine *machine; int err; - u8 origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - - ++top.samples; - - switch (origin) { - case PERF_RECORD_MISC_USER: - ++top.us_samples; - if (top.hide_user_symbols) - return; - machine = perf_session__find_host_machine(session); - break; - case PERF_RECORD_MISC_KERNEL: - ++top.kernel_samples; - if (top.hide_kernel_symbols) - return; - machine = perf_session__find_host_machine(session); - break; - case PERF_RECORD_MISC_GUEST_KERNEL: - ++top.guest_kernel_samples; - machine = perf_session__find_machine(session, event->ip.pid); - break; - case PERF_RECORD_MISC_GUEST_USER: - ++top.guest_us_samples; - /* - * TODO: we don't process guest user from host side - * except simple counting. - */ - return; - default: - return; - } if (!machine && perf_guest) { pr_err("Can't find guest [%d]'s kernel information\n", @@ -722,14 +663,14 @@ static void perf_event__process_sample(const union perf_event *event, } if (event->header.misc & PERF_RECORD_MISC_EXACT_IP) - top.exact_samples++; + top->exact_samples++; - if (perf_event__preprocess_sample(event, session, &al, sample, + if (perf_event__preprocess_sample(event, machine, &al, sample, symbol_filter) < 0 || al.filtered) return; - if (!kptr_restrict_warned && + if (!top->kptr_restrict_warned && symbol_conf.kptr_restrict && al.cpumode == PERF_RECORD_MISC_KERNEL) { ui__warning( @@ -740,7 +681,7 @@ static void perf_event__process_sample(const union perf_event *event, " modules" : ""); if (use_browser <= 0) sleep(5); - kptr_restrict_warned = true; + top->kptr_restrict_warned = true; } if (al.sym == NULL) { @@ -756,7 +697,7 @@ static void perf_event__process_sample(const union perf_event *event, * --hide-kernel-symbols, even if the user specifies an * invalid --vmlinux ;-) */ - if (!kptr_restrict_warned && !vmlinux_warned && + if (!top->kptr_restrict_warned && !top->vmlinux_warned && al.map == machine->vmlinux_maps[MAP__FUNCTION] && RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) { if (symbol_conf.vmlinux_name) { @@ -769,7 +710,7 @@ static void perf_event__process_sample(const union perf_event *event, if (use_browser <= 0) sleep(5); - vmlinux_warned = true; + top->vmlinux_warned = true; } } @@ -778,70 +719,109 @@ static void perf_event__process_sample(const union perf_event *event, if ((sort__has_parent || symbol_conf.use_callchain) && sample->callchain) { - err = perf_session__resolve_callchain(session, al.thread, - sample->callchain, &parent); + err = machine__resolve_callchain(machine, evsel, al.thread, + sample->callchain, &parent); if (err) return; } - he = perf_session__add_hist_entry(session, &al, sample, evsel); + he = perf_evsel__add_hist_entry(evsel, &al, sample); if (he == NULL) { pr_err("Problem incrementing symbol period, skipping event\n"); return; } if (symbol_conf.use_callchain) { - err = callchain_append(he->callchain, &session->callchain_cursor, + err = callchain_append(he->callchain, &evsel->hists.callchain_cursor, sample->period); if (err) return; } - if (sort_has_symbols) - record_precise_ip(he, evsel->idx, ip); + if (top->sort_has_symbols) + perf_top__record_precise_ip(top, he, evsel->idx, ip); } return; } -static void perf_session__mmap_read_idx(struct perf_session *self, int idx) +static void perf_top__mmap_read_idx(struct perf_top *top, int idx) { struct perf_sample sample; struct perf_evsel *evsel; + struct perf_session *session = top->session; union perf_event *event; + struct machine *machine; + u8 origin; int ret; - while ((event = perf_evlist__mmap_read(top.evlist, idx)) != NULL) { - ret = perf_session__parse_sample(self, event, &sample); + while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) { + ret = perf_session__parse_sample(session, event, &sample); if (ret) { pr_err("Can't parse sample, err = %d\n", ret); continue; } - evsel = perf_evlist__id2evsel(self->evlist, sample.id); + evsel = perf_evlist__id2evsel(session->evlist, sample.id); assert(evsel != NULL); + origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + if (event->header.type == PERF_RECORD_SAMPLE) - perf_event__process_sample(event, evsel, &sample, self); - else if (event->header.type < PERF_RECORD_MAX) { + ++top->samples; + + switch (origin) { + case PERF_RECORD_MISC_USER: + ++top->us_samples; + if (top->hide_user_symbols) + continue; + machine = perf_session__find_host_machine(session); + break; + case PERF_RECORD_MISC_KERNEL: + ++top->kernel_samples; + if (top->hide_kernel_symbols) + continue; + machine = perf_session__find_host_machine(session); + break; + case PERF_RECORD_MISC_GUEST_KERNEL: + ++top->guest_kernel_samples; + machine = perf_session__find_machine(session, event->ip.pid); + break; + case PERF_RECORD_MISC_GUEST_USER: + ++top->guest_us_samples; + /* + * TODO: we don't process guest user from host side + * except simple counting. + */ + /* Fall thru */ + default: + continue; + } + + + if (event->header.type == PERF_RECORD_SAMPLE) { + perf_event__process_sample(&top->tool, event, evsel, + &sample, machine); + } else if (event->header.type < PERF_RECORD_MAX) { hists__inc_nr_events(&evsel->hists, event->header.type); - perf_event__process(event, &sample, self); + perf_event__process(&top->tool, event, &sample, machine); } else - ++self->hists.stats.nr_unknown_events; + ++session->hists.stats.nr_unknown_events; } } -static void perf_session__mmap_read(struct perf_session *self) +static void perf_top__mmap_read(struct perf_top *top) { int i; - for (i = 0; i < top.evlist->nr_mmaps; i++) - perf_session__mmap_read_idx(self, i); + for (i = 0; i < top->evlist->nr_mmaps; i++) + perf_top__mmap_read_idx(top, i); } -static void start_counters(struct perf_evlist *evlist) +static void perf_top__start_counters(struct perf_top *top) { struct perf_evsel *counter, *first; + struct perf_evlist *evlist = top->evlist; first = list_entry(evlist->entries.next, struct perf_evsel, node); @@ -849,15 +829,15 @@ static void start_counters(struct perf_evlist *evlist) struct perf_event_attr *attr = &counter->attr; struct xyarray *group_fd = NULL; - if (group && counter != first) + if (top->group && counter != first) group_fd = first->fd; attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - if (top.freq) { + if (top->freq) { attr->sample_type |= PERF_SAMPLE_PERIOD; attr->freq = 1; - attr->sample_freq = top.freq; + attr->sample_freq = top->freq; } if (evlist->nr_entries > 1) { @@ -870,23 +850,23 @@ static void start_counters(struct perf_evlist *evlist) attr->mmap = 1; attr->comm = 1; - attr->inherit = inherit; + attr->inherit = top->inherit; retry_sample_id: - attr->sample_id_all = sample_id_all_avail ? 1 : 0; + attr->sample_id_all = top->sample_id_all_avail ? 1 : 0; try_again: - if (perf_evsel__open(counter, top.evlist->cpus, - top.evlist->threads, group, + if (perf_evsel__open(counter, top->evlist->cpus, + top->evlist->threads, top->group, group_fd) < 0) { int err = errno; if (err == EPERM || err == EACCES) { ui__error_paranoid(); goto out_err; - } else if (err == EINVAL && sample_id_all_avail) { + } else if (err == EINVAL && top->sample_id_all_avail) { /* * Old kernel, no attr->sample_id_type_all field */ - sample_id_all_avail = false; + top->sample_id_all_avail = false; goto retry_sample_id; } /* @@ -920,7 +900,7 @@ try_again: } } - if (perf_evlist__mmap(evlist, mmap_pages, false) < 0) { + if (perf_evlist__mmap(evlist, top->mmap_pages, false) < 0) { ui__warning("Failed to mmap with %d (%s)\n", errno, strerror(errno)); goto out_err; @@ -933,14 +913,14 @@ out_err: exit(0); } -static int setup_sample_type(void) +static int perf_top__setup_sample_type(struct perf_top *top) { - if (!sort_has_symbols) { + if (!top->sort_has_symbols) { if (symbol_conf.use_callchain) { ui__warning("Selected -g but \"sym\" not present in --sort/-s."); return -EINVAL; } - } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE) { + } else if (!top->dont_use_callchains && callchain_param.mode != CHAIN_NONE) { if (callchain_register_param(&callchain_param) < 0) { ui__warning("Can't register callchain params.\n"); return -EINVAL; @@ -950,7 +930,7 @@ static int setup_sample_type(void) return 0; } -static int __cmd_top(void) +static int __cmd_top(struct perf_top *top) { pthread_t thread; int ret; @@ -958,39 +938,40 @@ static int __cmd_top(void) * FIXME: perf_session__new should allow passing a O_MMAP, so that all this * mmap reading, etc is encapsulated in it. Use O_WRONLY for now. */ - top.session = perf_session__new(NULL, O_WRONLY, false, false, NULL); - if (top.session == NULL) + top->session = perf_session__new(NULL, O_WRONLY, false, false, NULL); + if (top->session == NULL) return -ENOMEM; - ret = setup_sample_type(); + ret = perf_top__setup_sample_type(top); if (ret) goto out_delete; - if (top.target_tid != -1) - perf_event__synthesize_thread_map(top.evlist->threads, - perf_event__process, top.session); + if (top->target_tid != -1) + perf_event__synthesize_thread_map(&top->tool, top->evlist->threads, + perf_event__process, + &top->session->host_machine); else - perf_event__synthesize_threads(perf_event__process, top.session); - - start_counters(top.evlist); - top.session->evlist = top.evlist; - perf_session__update_sample_type(top.session); + perf_event__synthesize_threads(&top->tool, perf_event__process, + &top->session->host_machine); + perf_top__start_counters(top); + top->session->evlist = top->evlist; + perf_session__update_sample_type(top->session); /* Wait for a minimal set of events before starting the snapshot */ - poll(top.evlist->pollfd, top.evlist->nr_fds, 100); + poll(top->evlist->pollfd, top->evlist->nr_fds, 100); - perf_session__mmap_read(top.session); + perf_top__mmap_read(top); if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui : - display_thread), NULL)) { + display_thread), top)) { printf("Could not create display thread.\n"); exit(-1); } - if (realtime_prio) { + if (top->realtime_prio) { struct sched_param param; - param.sched_priority = realtime_prio; + param.sched_priority = top->realtime_prio; if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { printf("Could not set realtime priority.\n"); exit(-1); @@ -998,25 +979,25 @@ static int __cmd_top(void) } while (1) { - u64 hits = top.samples; + u64 hits = top->samples; - perf_session__mmap_read(top.session); + perf_top__mmap_read(top); - if (hits == top.samples) - ret = poll(top.evlist->pollfd, top.evlist->nr_fds, 100); + if (hits == top->samples) + ret = poll(top->evlist->pollfd, top->evlist->nr_fds, 100); } out_delete: - perf_session__delete(top.session); - top.session = NULL; + perf_session__delete(top->session); + top->session = NULL; return 0; } static int -parse_callchain_opt(const struct option *opt __used, const char *arg, - int unset) +parse_callchain_opt(const struct option *opt, const char *arg, int unset) { + struct perf_top *top = (struct perf_top *)opt->value; char *tok, *tok2; char *endptr; @@ -1024,7 +1005,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, * --no-call-graph */ if (unset) { - dont_use_callchains = true; + top->dont_use_callchains = true; return 0; } @@ -1052,9 +1033,7 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, symbol_conf.use_callchain = false; return 0; - } - - else + } else return -1; /* get the min percentage */ @@ -1098,17 +1077,32 @@ static const char * const top_usage[] = { NULL }; -static const struct option options[] = { +int cmd_top(int argc, const char **argv, const char *prefix __used) +{ + struct perf_evsel *pos; + int status = -ENOMEM; + struct perf_top top = { + .count_filter = 5, + .delay_secs = 2, + .target_pid = -1, + .target_tid = -1, + .freq = 1000, /* 1 KHz */ + .sample_id_all_avail = true, + .mmap_pages = 128, + .sym_pcnt_filter = 5, + }; + char callchain_default_opt[] = "fractal,0.5,callee"; + const struct option options[] = { OPT_CALLBACK('e', "event", &top.evlist, "event", "event selector. use 'perf list' to list available events", parse_events_option), - OPT_INTEGER('c', "count", &default_interval, + OPT_INTEGER('c', "count", &top.default_interval, "event period to sample"), OPT_INTEGER('p', "pid", &top.target_pid, "profile events on existing process id"), OPT_INTEGER('t', "tid", &top.target_tid, "profile events on existing thread id"), - OPT_BOOLEAN('a', "all-cpus", &system_wide, + OPT_BOOLEAN('a', "all-cpus", &top.system_wide, "system-wide collection from all CPUs"), OPT_STRING('C', "cpu", &top.cpu_list, "cpu", "list of cpus to monitor"), @@ -1116,20 +1110,20 @@ static const struct option options[] = { "file", "vmlinux pathname"), OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols, "hide kernel symbols"), - OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"), - OPT_INTEGER('r', "realtime", &realtime_prio, + OPT_UINTEGER('m', "mmap-pages", &top.mmap_pages, "number of mmap data pages"), + OPT_INTEGER('r', "realtime", &top.realtime_prio, "collect data with this RT SCHED_FIFO priority"), OPT_INTEGER('d', "delay", &top.delay_secs, "number of seconds to delay between refreshes"), - OPT_BOOLEAN('D', "dump-symtab", &dump_symtab, + OPT_BOOLEAN('D', "dump-symtab", &top.dump_symtab, "dump the symbol table used for profiling"), OPT_INTEGER('f', "count-filter", &top.count_filter, "only display functions with more events than this"), - OPT_BOOLEAN('g', "group", &group, + OPT_BOOLEAN('g', "group", &top.group, "put the counters into a counter group"), - OPT_BOOLEAN('i', "inherit", &inherit, + OPT_BOOLEAN('i', "inherit", &top.inherit, "child tasks inherit counters"), - OPT_STRING(0, "sym-annotate", &sym_filter, "symbol name", + OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name", "symbol to annotate"), OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"), @@ -1139,15 +1133,15 @@ static const struct option options[] = { "display this many functions"), OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols, "hide user symbols"), - OPT_BOOLEAN(0, "tui", &use_tui, "Use the TUI interface"), - OPT_BOOLEAN(0, "stdio", &use_stdio, "Use the stdio interface"), + OPT_BOOLEAN(0, "tui", &top.use_tui, "Use the TUI interface"), + OPT_BOOLEAN(0, "stdio", &top.use_stdio, "Use the stdio interface"), OPT_INCR('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), OPT_STRING('s', "sort", &sort_order, "key[,key2...]", "sort by key(s): pid, comm, dso, symbol, parent"), OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples, "Show a column with the number of samples"), - OPT_CALLBACK_DEFAULT('G', "call-graph", NULL, "output_type,min_percent, call_order", + OPT_CALLBACK_DEFAULT('G', "call-graph", &top, "output_type,min_percent, call_order", "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold and callchain order. " "Default: fractal,0.5,callee", &parse_callchain_opt, callchain_default_opt), @@ -1166,12 +1160,7 @@ static const struct option options[] = { OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style", "Specify disassembler style (e.g. -M intel for intel syntax)"), OPT_END() -}; - -int cmd_top(int argc, const char **argv, const char *prefix __used) -{ - struct perf_evsel *pos; - int status = -ENOMEM; + }; top.evlist = perf_evlist__new(NULL, NULL); if (top.evlist == NULL) @@ -1188,9 +1177,9 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) setup_sorting(top_usage, options); - if (use_stdio) + if (top.use_stdio) use_browser = 0; - else if (use_tui) + else if (top.use_tui) use_browser = 1; setup_browser(false); @@ -1215,38 +1204,31 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) return -ENOMEM; } + symbol_conf.nr_events = top.evlist->nr_entries; + if (top.delay_secs < 1) top.delay_secs = 1; /* * User specified count overrides default frequency. */ - if (default_interval) + if (top.default_interval) top.freq = 0; else if (top.freq) { - default_interval = top.freq; + top.default_interval = top.freq; } else { fprintf(stderr, "frequency and count are zero, aborting\n"); exit(EXIT_FAILURE); } list_for_each_entry(pos, &top.evlist->entries, node) { - if (perf_evsel__alloc_fd(pos, top.evlist->cpus->nr, - top.evlist->threads->nr) < 0) - goto out_free_fd; /* * Fill in the ones not specifically initialized via -c: */ - if (pos->attr.sample_period) - continue; - - pos->attr.sample_period = default_interval; + if (!pos->attr.sample_period) + pos->attr.sample_period = top.default_interval; } - if (perf_evlist__alloc_pollfd(top.evlist) < 0 || - perf_evlist__alloc_mmap(top.evlist) < 0) - goto out_free_fd; - top.sym_evsel = list_entry(top.evlist->entries.next, struct perf_evsel, node); symbol_conf.priv_size = sizeof(struct annotation); @@ -1263,16 +1245,20 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) * Avoid annotation data structures overhead when symbols aren't on the * sort list. */ - sort_has_symbols = sort_sym.list.next != NULL; + top.sort_has_symbols = sort_sym.list.next != NULL; - get_term_dimensions(&winsize); + get_term_dimensions(&top.winsize); if (top.print_entries == 0) { - update_print_entries(&winsize); - signal(SIGWINCH, sig_winch_handler); + struct sigaction act = { + .sa_sigaction = perf_top__sig_winch, + .sa_flags = SA_SIGINFO, + }; + perf_top__update_print_entries(&top); + sigaction(SIGWINCH, &act, NULL); } - status = __cmd_top(); -out_free_fd: + status = __cmd_top(&top); + perf_evlist__delete(top.evlist); return status; diff --git a/tools/perf/perf.c b/tools/perf/perf.c index 73d0cac8b67..2b2e225a4d4 100644 --- a/tools/perf/perf.c +++ b/tools/perf/perf.c @@ -29,8 +29,6 @@ struct pager_config { int val; }; -static char debugfs_mntpt[MAXPATHLEN]; - static int pager_command_config(const char *var, const char *value, void *data) { struct pager_config *c = data; @@ -81,15 +79,6 @@ static void commit_pager_choice(void) } } -static void set_debugfs_path(void) -{ - char *path; - - path = getenv(PERF_DEBUGFS_ENVIRONMENT); - snprintf(debugfs_path, MAXPATHLEN, "%s/%s", path ?: debugfs_mntpt, - "tracing/events"); -} - static int handle_options(const char ***argv, int *argc, int *envchanged) { int handled = 0; @@ -161,15 +150,14 @@ static int handle_options(const char ***argv, int *argc, int *envchanged) fprintf(stderr, "No directory given for --debugfs-dir.\n"); usage(perf_usage_string); } - strncpy(debugfs_mntpt, (*argv)[1], MAXPATHLEN); - debugfs_mntpt[MAXPATHLEN - 1] = '\0'; + debugfs_set_path((*argv)[1]); if (envchanged) *envchanged = 1; (*argv)++; (*argc)--; } else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) { - strncpy(debugfs_mntpt, cmd + strlen(CMD_DEBUGFS_DIR), MAXPATHLEN); - debugfs_mntpt[MAXPATHLEN - 1] = '\0'; + debugfs_set_path(cmd + strlen(CMD_DEBUGFS_DIR)); + fprintf(stderr, "dir: %s\n", debugfs_mountpoint); if (envchanged) *envchanged = 1; } else { @@ -281,7 +269,6 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv) if (use_pager == -1 && p->option & USE_PAGER) use_pager = 1; commit_pager_choice(); - set_debugfs_path(); status = p->fn(argc, argv, prefix); exit_browser(status); @@ -416,17 +403,6 @@ static int run_argv(int *argcp, const char ***argv) return done_alias; } -/* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */ -static void get_debugfs_mntpt(void) -{ - const char *path = debugfs_mount(NULL); - - if (path) - strncpy(debugfs_mntpt, path, sizeof(debugfs_mntpt)); - else - debugfs_mntpt[0] = '\0'; -} - static void pthread__block_sigwinch(void) { sigset_t set; @@ -453,7 +429,7 @@ int main(int argc, const char **argv) if (!cmd) cmd = "perf-help"; /* get debugfs mount point from /proc/mounts */ - get_debugfs_mntpt(); + debugfs_mount(NULL); /* * "perf-xxxx" is the same as "perf xxxx", but we obviously: * @@ -476,7 +452,6 @@ int main(int argc, const char **argv) argc--; handle_options(&argv, &argc, NULL); commit_pager_choice(); - set_debugfs_path(); set_buildid_dir(); if (argc > 0) { diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 914c895510f..64f8bee31ce 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -185,4 +185,28 @@ extern const char perf_version_string[]; void pthread__unblock_sigwinch(void); +struct perf_record_opts { + pid_t target_pid; + pid_t target_tid; + bool call_graph; + bool group; + bool inherit_stat; + bool no_delay; + bool no_inherit; + bool no_samples; + bool pipe_output; + bool raw_samples; + bool sample_address; + bool sample_time; + bool sample_id_all_avail; + bool system_wide; + bool period; + unsigned int freq; + unsigned int mmap_pages; + unsigned int user_freq; + u64 default_interval; + u64 user_interval; + const char *cpu_list; +}; + #endif diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 119e996035c..011ed267660 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -25,17 +25,17 @@ int symbol__annotate_init(struct map *map __used, struct symbol *sym) return 0; } -int symbol__alloc_hist(struct symbol *sym, int nevents) +int symbol__alloc_hist(struct symbol *sym) { struct annotation *notes = symbol__annotation(sym); size_t sizeof_sym_hist = (sizeof(struct sym_hist) + (sym->end - sym->start) * sizeof(u64)); - notes->src = zalloc(sizeof(*notes->src) + nevents * sizeof_sym_hist); + notes->src = zalloc(sizeof(*notes->src) + symbol_conf.nr_events * sizeof_sym_hist); if (notes->src == NULL) return -1; notes->src->sizeof_sym_hist = sizeof_sym_hist; - notes->src->nr_histograms = nevents; + notes->src->nr_histograms = symbol_conf.nr_events; INIT_LIST_HEAD(¬es->src->source); return 0; } @@ -334,7 +334,7 @@ fallback: disassembler_style ? "-M " : "", disassembler_style ? disassembler_style : "", map__rip_2objdump(map, sym->start), - map__rip_2objdump(map, sym->end), + map__rip_2objdump(map, sym->end+1), symbol_conf.annotate_asm_raw ? "" : "--no-show-raw", symbol_conf.annotate_src ? "-S" : "", symfs_filename, filename); diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index d9072523d34..efa5dc82bfa 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -72,7 +72,7 @@ static inline struct annotation *symbol__annotation(struct symbol *sym) int symbol__inc_addr_samples(struct symbol *sym, struct map *map, int evidx, u64 addr); -int symbol__alloc_hist(struct symbol *sym, int nevents); +int symbol__alloc_hist(struct symbol *sym); void symbol__annotate_zero_histograms(struct symbol *sym); int symbol__annotate(struct symbol *sym, struct map *map, size_t privsize); @@ -99,8 +99,7 @@ static inline int symbol__tui_annotate(struct symbol *sym __used, } #else int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, - int nr_events, void(*timer)(void *arg), void *arg, - int delay_secs); + void(*timer)(void *arg), void *arg, int delay_secs); #endif extern const char *disassembler_style; diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index a91cd99f26e..dff9c7a725f 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -13,15 +13,18 @@ #include "symbol.h" #include <linux/kernel.h> #include "debug.h" +#include "session.h" +#include "tool.h" -static int build_id__mark_dso_hit(union perf_event *event, +static int build_id__mark_dso_hit(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, struct perf_evsel *evsel __used, - struct perf_session *session) + struct machine *machine) { struct addr_location al; u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - struct thread *thread = perf_session__findnew(session, event->ip.pid); + struct thread *thread = machine__findnew_thread(machine, event->ip.pid); if (thread == NULL) { pr_err("problem processing %d event, skipping it.\n", @@ -29,8 +32,8 @@ static int build_id__mark_dso_hit(union perf_event *event, return -1; } - thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, - event->ip.pid, event->ip.ip, &al); + thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION, + event->ip.ip, &al); if (al.map != NULL) al.map->dso->hit = 1; @@ -38,25 +41,26 @@ static int build_id__mark_dso_hit(union perf_event *event, return 0; } -static int perf_event__exit_del_thread(union perf_event *event, +static int perf_event__exit_del_thread(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct machine *machine) { - struct thread *thread = perf_session__findnew(session, event->fork.tid); + struct thread *thread = machine__findnew_thread(machine, event->fork.tid); dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, event->fork.ppid, event->fork.ptid); if (thread) { - rb_erase(&thread->rb_node, &session->threads); - session->last_match = NULL; + rb_erase(&thread->rb_node, &machine->threads); + machine->last_match = NULL; thread__delete(thread); } return 0; } -struct perf_event_ops build_id__mark_dso_hit_ops = { +struct perf_tool build_id__mark_dso_hit_ops = { .sample = build_id__mark_dso_hit, .mmap = perf_event__process_mmap, .fork = perf_event__process_task, diff --git a/tools/perf/util/build-id.h b/tools/perf/util/build-id.h index 5dafb00eaa0..a993ba87d99 100644 --- a/tools/perf/util/build-id.h +++ b/tools/perf/util/build-id.h @@ -3,7 +3,7 @@ #include "session.h" -extern struct perf_event_ops build_id__mark_dso_hit_ops; +extern struct perf_tool build_id__mark_dso_hit_ops; char *dso__build_id_filename(struct dso *self, char *bf, size_t size); diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 9b4ff16cac9..7f9c0f1ae3a 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -101,6 +101,9 @@ int callchain_append(struct callchain_root *root, int callchain_merge(struct callchain_cursor *cursor, struct callchain_root *dst, struct callchain_root *src); +struct ip_callchain; +union perf_event; + bool ip_callchain__valid(struct ip_callchain *chain, const union perf_event *event); /* diff --git a/tools/perf/util/cgroup.c b/tools/perf/util/cgroup.c index 96bee5c4600..dbe2f16b1a1 100644 --- a/tools/perf/util/cgroup.c +++ b/tools/perf/util/cgroup.c @@ -3,7 +3,6 @@ #include "parse-options.h" #include "evsel.h" #include "cgroup.h" -#include "debugfs.h" /* MAX_PATH, STR() */ #include "evlist.h" int nr_cgroups; @@ -12,7 +11,7 @@ static int cgroupfs_find_mountpoint(char *buf, size_t maxlen) { FILE *fp; - char mountpoint[MAX_PATH+1], tokens[MAX_PATH+1], type[MAX_PATH+1]; + char mountpoint[PATH_MAX + 1], tokens[PATH_MAX + 1], type[PATH_MAX + 1]; char *token, *saved_ptr = NULL; int found = 0; @@ -25,8 +24,8 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) * and inspect every cgroupfs mount point to find one that has * perf_event subsystem */ - while (fscanf(fp, "%*s %"STR(MAX_PATH)"s %"STR(MAX_PATH)"s %" - STR(MAX_PATH)"s %*d %*d\n", + while (fscanf(fp, "%*s %"STR(PATH_MAX)"s %"STR(PATH_MAX)"s %" + STR(PATH_MAX)"s %*d %*d\n", mountpoint, type, tokens) == 3) { if (!strcmp(type, "cgroup")) { @@ -57,15 +56,15 @@ cgroupfs_find_mountpoint(char *buf, size_t maxlen) static int open_cgroup(char *name) { - char path[MAX_PATH+1]; - char mnt[MAX_PATH+1]; + char path[PATH_MAX + 1]; + char mnt[PATH_MAX + 1]; int fd; - if (cgroupfs_find_mountpoint(mnt, MAX_PATH+1)) + if (cgroupfs_find_mountpoint(mnt, PATH_MAX + 1)) return -1; - snprintf(path, MAX_PATH, "%s/%s", mnt, name); + snprintf(path, PATH_MAX, "%s/%s", mnt, name); fd = open(path, O_RDONLY); if (fd == -1) diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index 80d9598db31..0deac6a14b6 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -1,5 +1,8 @@ /* - * GIT - The information manager from hell + * config.c + * + * Helper functions for parsing config items. + * Originally copied from GIT source. * * Copyright (C) Linus Torvalds, 2005 * Copyright (C) Johannes Schindelin, 2005 diff --git a/tools/perf/util/debugfs.c b/tools/perf/util/debugfs.c index a88fefc0cc0..ffc35e748e8 100644 --- a/tools/perf/util/debugfs.c +++ b/tools/perf/util/debugfs.c @@ -2,8 +2,12 @@ #include "debugfs.h" #include "cache.h" +#include <linux/kernel.h> +#include <sys/mount.h> + static int debugfs_premounted; -static char debugfs_mountpoint[MAX_PATH+1]; +char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug"; +char tracing_events_path[PATH_MAX + 1] = "/sys/kernel/debug/tracing/events"; static const char *debugfs_known_mountpoints[] = { "/sys/kernel/debug/", @@ -62,11 +66,9 @@ const char *debugfs_find_mountpoint(void) /* give up and parse /proc/mounts */ fp = fopen("/proc/mounts", "r"); if (fp == NULL) - die("Can't open /proc/mounts for read"); + return NULL; - while (fscanf(fp, "%*s %" - STR(MAX_PATH) - "s %99s %*s %*d %*d\n", + while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n", debugfs_mountpoint, type) == 2) { if (strcmp(type, "debugfs") == 0) break; @@ -106,6 +108,12 @@ int debugfs_valid_entry(const char *path) return 0; } +static void debugfs_set_tracing_events_path(const char *mountpoint) +{ + snprintf(tracing_events_path, sizeof(tracing_events_path), "%s/%s", + mountpoint, "tracing/events"); +} + /* mount the debugfs somewhere if it's not mounted */ char *debugfs_mount(const char *mountpoint) @@ -113,7 +121,7 @@ char *debugfs_mount(const char *mountpoint) /* see if it's already mounted */ if (debugfs_find_mountpoint()) { debugfs_premounted = 1; - return debugfs_mountpoint; + goto out; } /* if not mounted and no argument */ @@ -129,12 +137,19 @@ char *debugfs_mount(const char *mountpoint) return NULL; /* save the mountpoint */ - strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint)); debugfs_found = 1; - + strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint)); +out: + debugfs_set_tracing_events_path(debugfs_mountpoint); return debugfs_mountpoint; } +void debugfs_set_path(const char *mountpoint) +{ + snprintf(debugfs_mountpoint, sizeof(debugfs_mountpoint), "%s", mountpoint); + debugfs_set_tracing_events_path(mountpoint); +} + /* umount the debugfs */ int debugfs_umount(void) @@ -158,7 +173,7 @@ int debugfs_umount(void) int debugfs_write(const char *entry, const char *value) { - char path[MAX_PATH+1]; + char path[PATH_MAX + 1]; int ret, count; int fd; @@ -203,7 +218,7 @@ int debugfs_write(const char *entry, const char *value) */ int debugfs_read(const char *entry, char *buffer, size_t size) { - char path[MAX_PATH+1]; + char path[PATH_MAX + 1]; int ret; int fd; diff --git a/tools/perf/util/debugfs.h b/tools/perf/util/debugfs.h index 83a02879745..4a878f735eb 100644 --- a/tools/perf/util/debugfs.h +++ b/tools/perf/util/debugfs.h @@ -1,25 +1,18 @@ #ifndef __DEBUGFS_H__ #define __DEBUGFS_H__ -#include <sys/mount.h> +const char *debugfs_find_mountpoint(void); +int debugfs_valid_mountpoint(const char *debugfs); +int debugfs_valid_entry(const char *path); +char *debugfs_mount(const char *mountpoint); +int debugfs_umount(void); +void debugfs_set_path(const char *mountpoint); +int debugfs_write(const char *entry, const char *value); +int debugfs_read(const char *entry, char *buffer, size_t size); +void debugfs_force_cleanup(void); +int debugfs_make_path(const char *element, char *buffer, int size); -#ifndef MAX_PATH -# define MAX_PATH 256 -#endif - -#ifndef STR -# define _STR(x) #x -# define STR(x) _STR(x) -#endif - -extern const char *debugfs_find_mountpoint(void); -extern int debugfs_valid_mountpoint(const char *debugfs); -extern int debugfs_valid_entry(const char *path); -extern char *debugfs_mount(const char *mountpoint); -extern int debugfs_umount(void); -extern int debugfs_write(const char *entry, const char *value); -extern int debugfs_read(const char *entry, char *buffer, size_t size); -extern void debugfs_force_cleanup(void); -extern int debugfs_make_path(const char *element, char *buffer, int size); +extern char debugfs_mountpoint[]; +extern char tracing_events_path[]; #endif /* __DEBUGFS_H__ */ diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 437f8ca679a..73ddaf06b8e 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -1,7 +1,6 @@ #include <linux/types.h> #include "event.h" #include "debug.h" -#include "session.h" #include "sort.h" #include "string.h" #include "strlist.h" @@ -44,36 +43,27 @@ static struct perf_sample synth_sample = { .period = 1, }; -static pid_t perf_event__synthesize_comm(union perf_event *event, pid_t pid, - int full, perf_event__handler_t process, - struct perf_session *session) +static pid_t perf_event__get_comm_tgid(pid_t pid, char *comm, size_t len) { char filename[PATH_MAX]; char bf[BUFSIZ]; FILE *fp; size_t size = 0; - DIR *tasks; - struct dirent dirent, *next; - pid_t tgid = 0; + pid_t tgid = -1; snprintf(filename, sizeof(filename), "/proc/%d/status", pid); fp = fopen(filename, "r"); if (fp == NULL) { -out_race: - /* - * We raced with a task exiting - just return: - */ pr_debug("couldn't open %s\n", filename); return 0; } - memset(&event->comm, 0, sizeof(event->comm)); - - while (!event->comm.comm[0] || !event->comm.pid) { + while (!comm[0] || (tgid < 0)) { if (fgets(bf, sizeof(bf), fp) == NULL) { - pr_warning("couldn't get COMM and pgid, malformed %s\n", filename); - goto out; + pr_warning("couldn't get COMM and pgid, malformed %s\n", + filename); + break; } if (memcmp(bf, "Name:", 5) == 0) { @@ -81,33 +71,65 @@ out_race: while (*name && isspace(*name)) ++name; size = strlen(name) - 1; - memcpy(event->comm.comm, name, size++); + if (size >= len) + size = len - 1; + memcpy(comm, name, size); + } else if (memcmp(bf, "Tgid:", 5) == 0) { char *tgids = bf + 5; while (*tgids && isspace(*tgids)) ++tgids; - tgid = event->comm.pid = atoi(tgids); + tgid = atoi(tgids); } } + fclose(fp); + + return tgid; +} + +static pid_t perf_event__synthesize_comm(struct perf_tool *tool, + union perf_event *event, pid_t pid, + int full, + perf_event__handler_t process, + struct machine *machine) +{ + char filename[PATH_MAX]; + size_t size; + DIR *tasks; + struct dirent dirent, *next; + pid_t tgid; + + memset(&event->comm, 0, sizeof(event->comm)); + + tgid = perf_event__get_comm_tgid(pid, event->comm.comm, + sizeof(event->comm.comm)); + if (tgid < 0) + goto out; + + event->comm.pid = tgid; event->comm.header.type = PERF_RECORD_COMM; + + size = strlen(event->comm.comm) + 1; size = ALIGN(size, sizeof(u64)); - memset(event->comm.comm + size, 0, session->id_hdr_size); + memset(event->comm.comm + size, 0, machine->id_hdr_size); event->comm.header.size = (sizeof(event->comm) - (sizeof(event->comm.comm) - size) + - session->id_hdr_size); + machine->id_hdr_size); if (!full) { event->comm.tid = pid; - process(event, &synth_sample, session); + process(tool, event, &synth_sample, machine); goto out; } snprintf(filename, sizeof(filename), "/proc/%d/task", pid); tasks = opendir(filename); - if (tasks == NULL) - goto out_race; + if (tasks == NULL) { + pr_debug("couldn't open %s\n", filename); + return 0; + } while (!readdir_r(tasks, &dirent, &next) && next) { char *end; @@ -115,22 +137,32 @@ out_race: if (*end) continue; + /* already have tgid; jut want to update the comm */ + (void) perf_event__get_comm_tgid(pid, event->comm.comm, + sizeof(event->comm.comm)); + + size = strlen(event->comm.comm) + 1; + size = ALIGN(size, sizeof(u64)); + memset(event->comm.comm + size, 0, machine->id_hdr_size); + event->comm.header.size = (sizeof(event->comm) - + (sizeof(event->comm.comm) - size) + + machine->id_hdr_size); + event->comm.tid = pid; - process(event, &synth_sample, session); + process(tool, event, &synth_sample, machine); } closedir(tasks); out: - fclose(fp); - return tgid; } -static int perf_event__synthesize_mmap_events(union perf_event *event, +static int perf_event__synthesize_mmap_events(struct perf_tool *tool, + union perf_event *event, pid_t pid, pid_t tgid, perf_event__handler_t process, - struct perf_session *session) + struct machine *machine) { char filename[PATH_MAX]; FILE *fp; @@ -193,12 +225,12 @@ static int perf_event__synthesize_mmap_events(union perf_event *event, event->mmap.len -= event->mmap.start; event->mmap.header.size = (sizeof(event->mmap) - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, session->id_hdr_size); - event->mmap.header.size += session->id_hdr_size; + memset(event->mmap.filename + size, 0, machine->id_hdr_size); + event->mmap.header.size += machine->id_hdr_size; event->mmap.pid = tgid; event->mmap.tid = pid; - process(event, &synth_sample, session); + process(tool, event, &synth_sample, machine); } } @@ -206,14 +238,14 @@ static int perf_event__synthesize_mmap_events(union perf_event *event, return 0; } -int perf_event__synthesize_modules(perf_event__handler_t process, - struct perf_session *session, +int perf_event__synthesize_modules(struct perf_tool *tool, + perf_event__handler_t process, struct machine *machine) { struct rb_node *nd; struct map_groups *kmaps = &machine->kmaps; union perf_event *event = zalloc((sizeof(event->mmap) + - session->id_hdr_size)); + machine->id_hdr_size)); if (event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); @@ -243,15 +275,15 @@ int perf_event__synthesize_modules(perf_event__handler_t process, event->mmap.header.type = PERF_RECORD_MMAP; event->mmap.header.size = (sizeof(event->mmap) - (sizeof(event->mmap.filename) - size)); - memset(event->mmap.filename + size, 0, session->id_hdr_size); - event->mmap.header.size += session->id_hdr_size; + memset(event->mmap.filename + size, 0, machine->id_hdr_size); + event->mmap.header.size += machine->id_hdr_size; event->mmap.start = pos->start; event->mmap.len = pos->end - pos->start; event->mmap.pid = machine->pid; memcpy(event->mmap.filename, pos->dso->long_name, pos->dso->long_name_len + 1); - process(event, &synth_sample, session); + process(tool, event, &synth_sample, machine); } free(event); @@ -260,40 +292,69 @@ int perf_event__synthesize_modules(perf_event__handler_t process, static int __event__synthesize_thread(union perf_event *comm_event, union perf_event *mmap_event, - pid_t pid, perf_event__handler_t process, - struct perf_session *session) + pid_t pid, int full, + perf_event__handler_t process, + struct perf_tool *tool, + struct machine *machine) { - pid_t tgid = perf_event__synthesize_comm(comm_event, pid, 1, process, - session); + pid_t tgid = perf_event__synthesize_comm(tool, comm_event, pid, full, + process, machine); if (tgid == -1) return -1; - return perf_event__synthesize_mmap_events(mmap_event, pid, tgid, - process, session); + return perf_event__synthesize_mmap_events(tool, mmap_event, pid, tgid, + process, machine); } -int perf_event__synthesize_thread_map(struct thread_map *threads, +int perf_event__synthesize_thread_map(struct perf_tool *tool, + struct thread_map *threads, perf_event__handler_t process, - struct perf_session *session) + struct machine *machine) { union perf_event *comm_event, *mmap_event; - int err = -1, thread; + int err = -1, thread, j; - comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); + comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); if (comm_event == NULL) goto out; - mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); + mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size); if (mmap_event == NULL) goto out_free_comm; err = 0; for (thread = 0; thread < threads->nr; ++thread) { if (__event__synthesize_thread(comm_event, mmap_event, - threads->map[thread], - process, session)) { + threads->map[thread], 0, + process, tool, machine)) { err = -1; break; } + + /* + * comm.pid is set to thread group id by + * perf_event__synthesize_comm + */ + if ((int) comm_event->comm.pid != threads->map[thread]) { + bool need_leader = true; + + /* is thread group leader in thread_map? */ + for (j = 0; j < threads->nr; ++j) { + if ((int) comm_event->comm.pid == threads->map[j]) { + need_leader = false; + break; + } + } + + /* if not, generate events for it */ + if (need_leader && + __event__synthesize_thread(comm_event, + mmap_event, + comm_event->comm.pid, 0, + process, tool, machine)) { + err = -1; + break; + } + } } free(mmap_event); out_free_comm: @@ -302,19 +363,20 @@ out: return err; } -int perf_event__synthesize_threads(perf_event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_threads(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) { DIR *proc; struct dirent dirent, *next; union perf_event *comm_event, *mmap_event; int err = -1; - comm_event = malloc(sizeof(comm_event->comm) + session->id_hdr_size); + comm_event = malloc(sizeof(comm_event->comm) + machine->id_hdr_size); if (comm_event == NULL) goto out; - mmap_event = malloc(sizeof(mmap_event->mmap) + session->id_hdr_size); + mmap_event = malloc(sizeof(mmap_event->mmap) + machine->id_hdr_size); if (mmap_event == NULL) goto out_free_comm; @@ -329,8 +391,8 @@ int perf_event__synthesize_threads(perf_event__handler_t process, if (*end) /* only interested in proper numerical dirents */ continue; - __event__synthesize_thread(comm_event, mmap_event, pid, - process, session); + __event__synthesize_thread(comm_event, mmap_event, pid, 1, + process, tool, machine); } closedir(proc); @@ -365,8 +427,8 @@ static int find_symbol_cb(void *arg, const char *name, char type, return 1; } -int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, - struct perf_session *session, +int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, + perf_event__handler_t process, struct machine *machine, const char *symbol_name) { @@ -383,7 +445,7 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, */ struct process_symbol_args args = { .name = symbol_name, }; union perf_event *event = zalloc((sizeof(event->mmap) + - session->id_hdr_size)); + machine->id_hdr_size)); if (event == NULL) { pr_debug("Not enough memory synthesizing mmap event " "for kernel modules\n"); @@ -417,25 +479,32 @@ int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, size = ALIGN(size, sizeof(u64)); event->mmap.header.type = PERF_RECORD_MMAP; event->mmap.header.size = (sizeof(event->mmap) - - (sizeof(event->mmap.filename) - size) + session->id_hdr_size); + (sizeof(event->mmap.filename) - size) + machine->id_hdr_size); event->mmap.pgoff = args.start; event->mmap.start = map->start; event->mmap.len = map->end - event->mmap.start; event->mmap.pid = machine->pid; - err = process(event, &synth_sample, session); + err = process(tool, event, &synth_sample, machine); free(event); return err; } -int perf_event__process_comm(union perf_event *event, +size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp) +{ + return fprintf(fp, ": %s:%d\n", event->comm.comm, event->comm.tid); +} + +int perf_event__process_comm(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct machine *machine) { - struct thread *thread = perf_session__findnew(session, event->comm.tid); + struct thread *thread = machine__findnew_thread(machine, event->comm.tid); - dump_printf(": %s:%d\n", event->comm.comm, event->comm.tid); + if (dump_trace) + perf_event__fprintf_comm(event, stdout); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); @@ -445,13 +514,13 @@ int perf_event__process_comm(union perf_event *event, return 0; } -int perf_event__process_lost(union perf_event *event, +int perf_event__process_lost(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct machine *machine __used) { dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", event->lost.id, event->lost.lost); - session->hists.stats.total_lost += event->lost.lost; return 0; } @@ -468,21 +537,15 @@ static void perf_event__set_kernel_mmap_len(union perf_event *event, maps[MAP__FUNCTION]->end = ~0ULL; } -static int perf_event__process_kernel_mmap(union perf_event *event, - struct perf_session *session) +static int perf_event__process_kernel_mmap(struct perf_tool *tool __used, + union perf_event *event, + struct machine *machine) { struct map *map; char kmmap_prefix[PATH_MAX]; - struct machine *machine; enum dso_kernel_type kernel_type; bool is_kernel_mmap; - machine = perf_session__findnew_machine(session, event->mmap.pid); - if (!machine) { - pr_err("Can't find id %d's machine\n", event->mmap.pid); - goto out_problem; - } - machine__mmap_name(machine, kmmap_prefix, sizeof(kmmap_prefix)); if (machine__is_host(machine)) kernel_type = DSO_TYPE_KERNEL; @@ -549,9 +612,9 @@ static int perf_event__process_kernel_mmap(union perf_event *event, * time /proc/sys/kernel/kptr_restrict was non zero. */ if (event->mmap.pgoff != 0) { - perf_session__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, - symbol_name, - event->mmap.pgoff); + maps__set_kallsyms_ref_reloc_sym(machine->vmlinux_maps, + symbol_name, + event->mmap.pgoff); } if (machine__is_default_guest(machine)) { @@ -567,32 +630,35 @@ out_problem: return -1; } -int perf_event__process_mmap(union perf_event *event, +size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp) +{ + return fprintf(fp, " %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", + event->mmap.pid, event->mmap.tid, event->mmap.start, + event->mmap.len, event->mmap.pgoff, event->mmap.filename); +} + +int perf_event__process_mmap(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct machine *machine) { - struct machine *machine; struct thread *thread; struct map *map; u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; int ret = 0; - dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", - event->mmap.pid, event->mmap.tid, event->mmap.start, - event->mmap.len, event->mmap.pgoff, event->mmap.filename); + if (dump_trace) + perf_event__fprintf_mmap(event, stdout); if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL || cpumode == PERF_RECORD_MISC_KERNEL) { - ret = perf_event__process_kernel_mmap(event, session); + ret = perf_event__process_kernel_mmap(tool, event, machine); if (ret < 0) goto out_problem; return 0; } - machine = perf_session__find_host_machine(session); - if (machine == NULL) - goto out_problem; - thread = perf_session__findnew(session, event->mmap.pid); + thread = machine__findnew_thread(machine, event->mmap.pid); if (thread == NULL) goto out_problem; map = map__new(&machine->user_dsos, event->mmap.start, @@ -610,18 +676,26 @@ out_problem: return 0; } -int perf_event__process_task(union perf_event *event, +size_t perf_event__fprintf_task(union perf_event *event, FILE *fp) +{ + return fprintf(fp, "(%d:%d):(%d:%d)\n", + event->fork.pid, event->fork.tid, + event->fork.ppid, event->fork.ptid); +} + +int perf_event__process_task(struct perf_tool *tool __used, + union perf_event *event, struct perf_sample *sample __used, - struct perf_session *session) + struct machine *machine) { - struct thread *thread = perf_session__findnew(session, event->fork.tid); - struct thread *parent = perf_session__findnew(session, event->fork.ptid); + struct thread *thread = machine__findnew_thread(machine, event->fork.tid); + struct thread *parent = machine__findnew_thread(machine, event->fork.ptid); - dump_printf("(%d:%d):(%d:%d)\n", event->fork.pid, event->fork.tid, - event->fork.ppid, event->fork.ptid); + if (dump_trace) + perf_event__fprintf_task(event, stdout); if (event->header.type == PERF_RECORD_EXIT) { - perf_session__remove_thread(session, thread); + machine__remove_thread(machine, thread); return 0; } @@ -634,22 +708,45 @@ int perf_event__process_task(union perf_event *event, return 0; } -int perf_event__process(union perf_event *event, struct perf_sample *sample, - struct perf_session *session) +size_t perf_event__fprintf(union perf_event *event, FILE *fp) +{ + size_t ret = fprintf(fp, "PERF_RECORD_%s", + perf_event__name(event->header.type)); + + switch (event->header.type) { + case PERF_RECORD_COMM: + ret += perf_event__fprintf_comm(event, fp); + break; + case PERF_RECORD_FORK: + case PERF_RECORD_EXIT: + ret += perf_event__fprintf_task(event, fp); + break; + case PERF_RECORD_MMAP: + ret += perf_event__fprintf_mmap(event, fp); + break; + default: + ret += fprintf(fp, "\n"); + } + + return ret; +} + +int perf_event__process(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine) { switch (event->header.type) { case PERF_RECORD_COMM: - perf_event__process_comm(event, sample, session); + perf_event__process_comm(tool, event, sample, machine); break; case PERF_RECORD_MMAP: - perf_event__process_mmap(event, sample, session); + perf_event__process_mmap(tool, event, sample, machine); break; case PERF_RECORD_FORK: case PERF_RECORD_EXIT: - perf_event__process_task(event, sample, session); + perf_event__process_task(tool, event, sample, machine); break; case PERF_RECORD_LOST: - perf_event__process_lost(event, sample, session); + perf_event__process_lost(tool, event, sample, machine); default: break; } @@ -658,36 +755,29 @@ int perf_event__process(union perf_event *event, struct perf_sample *sample, } void thread__find_addr_map(struct thread *self, - struct perf_session *session, u8 cpumode, - enum map_type type, pid_t pid, u64 addr, + struct machine *machine, u8 cpumode, + enum map_type type, u64 addr, struct addr_location *al) { struct map_groups *mg = &self->mg; - struct machine *machine = NULL; al->thread = self; al->addr = addr; al->cpumode = cpumode; al->filtered = false; + if (machine == NULL) { + al->map = NULL; + return; + } + if (cpumode == PERF_RECORD_MISC_KERNEL && perf_host) { al->level = 'k'; - machine = perf_session__find_host_machine(session); - if (machine == NULL) { - al->map = NULL; - return; - } mg = &machine->kmaps; } else if (cpumode == PERF_RECORD_MISC_USER && perf_host) { al->level = '.'; - machine = perf_session__find_host_machine(session); } else if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) { al->level = 'g'; - machine = perf_session__find_machine(session, pid); - if (machine == NULL) { - al->map = NULL; - return; - } mg = &machine->kmaps; } else { /* @@ -733,13 +823,12 @@ try_again: al->addr = al->map->map_ip(al->map, al->addr); } -void thread__find_addr_location(struct thread *self, - struct perf_session *session, u8 cpumode, - enum map_type type, pid_t pid, u64 addr, +void thread__find_addr_location(struct thread *thread, struct machine *machine, + u8 cpumode, enum map_type type, u64 addr, struct addr_location *al, symbol_filter_t filter) { - thread__find_addr_map(self, session, cpumode, type, pid, addr, al); + thread__find_addr_map(thread, machine, cpumode, type, addr, al); if (al->map != NULL) al->sym = map__find_symbol(al->map, al->addr, filter); else @@ -747,13 +836,13 @@ void thread__find_addr_location(struct thread *self, } int perf_event__preprocess_sample(const union perf_event *event, - struct perf_session *session, + struct machine *machine, struct addr_location *al, struct perf_sample *sample, symbol_filter_t filter) { u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - struct thread *thread = perf_session__findnew(session, event->ip.pid); + struct thread *thread = machine__findnew_thread(machine, event->ip.pid); if (thread == NULL) return -1; @@ -764,18 +853,18 @@ int perf_event__preprocess_sample(const union perf_event *event, dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); /* - * Have we already created the kernel maps for the host machine? + * Have we already created the kernel maps for this machine? * * This should have happened earlier, when we processed the kernel MMAP * events, but for older perf.data files there was no such thing, so do * it now. */ if (cpumode == PERF_RECORD_MISC_KERNEL && - session->host_machine.vmlinux_maps[MAP__FUNCTION] == NULL) - machine__create_kernel_maps(&session->host_machine); + machine->vmlinux_maps[MAP__FUNCTION] == NULL) + machine__create_kernel_maps(machine); - thread__find_addr_map(thread, session, cpumode, MAP__FUNCTION, - event->ip.pid, event->ip.ip, al); + thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION, + event->ip.ip, al); dump_printf(" ...... dso: %s\n", al->map ? al->map->dso->long_name : al->level == 'H' ? "[hypervisor]" : "<not found>"); @@ -783,13 +872,14 @@ int perf_event__preprocess_sample(const union perf_event *event, al->cpu = sample->cpu; if (al->map) { + struct dso *dso = al->map->dso; + if (symbol_conf.dso_list && - (!al->map || !al->map->dso || - !(strlist__has_entry(symbol_conf.dso_list, - al->map->dso->short_name) || - (al->map->dso->short_name != al->map->dso->long_name && - strlist__has_entry(symbol_conf.dso_list, - al->map->dso->long_name))))) + (!dso || !(strlist__has_entry(symbol_conf.dso_list, + dso->short_name) || + (dso->short_name != dso->long_name && + strlist__has_entry(symbol_conf.dso_list, + dso->long_name))))) goto out_filtered; al->sym = map__find_symbol(al->map, al->addr, filter); diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 357a85b8524..cbdeaad9c5e 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -2,6 +2,7 @@ #define __PERF_RECORD_H #include <limits.h> +#include <stdio.h> #include "../perf.h" #include "map.h" @@ -141,43 +142,54 @@ union perf_event { void perf_event__print_totals(void); -struct perf_session; +struct perf_tool; struct thread_map; -typedef int (*perf_event__handler_synth_t)(union perf_event *event, - struct perf_session *session); -typedef int (*perf_event__handler_t)(union perf_event *event, +typedef int (*perf_event__handler_t)(struct perf_tool *tool, + union perf_event *event, struct perf_sample *sample, - struct perf_session *session); + struct machine *machine); -int perf_event__synthesize_thread_map(struct thread_map *threads, +int perf_event__synthesize_thread_map(struct perf_tool *tool, + struct thread_map *threads, perf_event__handler_t process, - struct perf_session *session); -int perf_event__synthesize_threads(perf_event__handler_t process, - struct perf_session *session); -int perf_event__synthesize_kernel_mmap(perf_event__handler_t process, - struct perf_session *session, + struct machine *machine); +int perf_event__synthesize_threads(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine); +int perf_event__synthesize_kernel_mmap(struct perf_tool *tool, + perf_event__handler_t process, struct machine *machine, const char *symbol_name); -int perf_event__synthesize_modules(perf_event__handler_t process, - struct perf_session *session, +int perf_event__synthesize_modules(struct perf_tool *tool, + perf_event__handler_t process, struct machine *machine); -int perf_event__process_comm(union perf_event *event, struct perf_sample *sample, - struct perf_session *session); -int perf_event__process_lost(union perf_event *event, struct perf_sample *sample, - struct perf_session *session); -int perf_event__process_mmap(union perf_event *event, struct perf_sample *sample, - struct perf_session *session); -int perf_event__process_task(union perf_event *event, struct perf_sample *sample, - struct perf_session *session); -int perf_event__process(union perf_event *event, struct perf_sample *sample, - struct perf_session *session); +int perf_event__process_comm(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); +int perf_event__process_lost(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); +int perf_event__process_mmap(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); +int perf_event__process_task(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); +int perf_event__process(struct perf_tool *tool, + union perf_event *event, + struct perf_sample *sample, + struct machine *machine); struct addr_location; int perf_event__preprocess_sample(const union perf_event *self, - struct perf_session *session, + struct machine *machine, struct addr_location *al, struct perf_sample *sample, symbol_filter_t filter); @@ -187,5 +199,13 @@ const char *perf_event__name(unsigned int id); int perf_event__parse_sample(const union perf_event *event, u64 type, int sample_size, bool sample_id_all, struct perf_sample *sample, bool swapped); +int perf_event__synthesize_sample(union perf_event *event, u64 type, + const struct perf_sample *sample, + bool swapped); + +size_t perf_event__fprintf_comm(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_mmap(union perf_event *event, FILE *fp); +size_t perf_event__fprintf_task(union perf_event *event, FILE *fp); +size_t perf_event__fprintf(union perf_event *event, FILE *fp); #endif /* __PERF_RECORD_H */ diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index fbb4b4ab9cc..fa1837088ca 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -6,12 +6,16 @@ * * Released under the GPL v2. (and only v2, not any later version) */ +#include "util.h" +#include "debugfs.h" #include <poll.h> #include "cpumap.h" #include "thread_map.h" #include "evlist.h" #include "evsel.h" -#include "util.h" +#include <unistd.h> + +#include "parse-events.h" #include <sys/mman.h> @@ -30,6 +34,7 @@ void perf_evlist__init(struct perf_evlist *evlist, struct cpu_map *cpus, INIT_HLIST_HEAD(&evlist->heads[i]); INIT_LIST_HEAD(&evlist->entries); perf_evlist__set_maps(evlist, cpus, threads); + evlist->workload.pid = -1; } struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, @@ -43,6 +48,22 @@ struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, return evlist; } +void perf_evlist__config_attrs(struct perf_evlist *evlist, + struct perf_record_opts *opts) +{ + struct perf_evsel *evsel; + + if (evlist->cpus->map[0] < 0) + opts->no_inherit = true; + + list_for_each_entry(evsel, &evlist->entries, node) { + perf_evsel__config(evsel, opts); + + if (evlist->nr_entries > 1) + evsel->attr.sample_type |= PERF_SAMPLE_ID; + } +} + static void perf_evlist__purge(struct perf_evlist *evlist) { struct perf_evsel *pos, *n; @@ -76,6 +97,14 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry) ++evlist->nr_entries; } +static void perf_evlist__splice_list_tail(struct perf_evlist *evlist, + struct list_head *list, + int nr_entries) +{ + list_splice_tail(list, &evlist->entries); + evlist->nr_entries += nr_entries; +} + int perf_evlist__add_default(struct perf_evlist *evlist) { struct perf_event_attr attr = { @@ -100,6 +129,126 @@ error: return -ENOMEM; } +int perf_evlist__add_attrs(struct perf_evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs) +{ + struct perf_evsel *evsel, *n; + LIST_HEAD(head); + size_t i; + + for (i = 0; i < nr_attrs; i++) { + evsel = perf_evsel__new(attrs + i, evlist->nr_entries + i); + if (evsel == NULL) + goto out_delete_partial_list; + list_add_tail(&evsel->node, &head); + } + + perf_evlist__splice_list_tail(evlist, &head, nr_attrs); + + return 0; + +out_delete_partial_list: + list_for_each_entry_safe(evsel, n, &head, node) + perf_evsel__delete(evsel); + return -1; +} + +static int trace_event__id(const char *evname) +{ + char *filename, *colon; + int err = -1, fd; + + if (asprintf(&filename, "%s/%s/id", tracing_events_path, evname) < 0) + return -1; + + colon = strrchr(filename, ':'); + if (colon != NULL) + *colon = '/'; + + fd = open(filename, O_RDONLY); + if (fd >= 0) { + char id[16]; + if (read(fd, id, sizeof(id)) > 0) + err = atoi(id); + close(fd); + } + + free(filename); + return err; +} + +int perf_evlist__add_tracepoints(struct perf_evlist *evlist, + const char *tracepoints[], + size_t nr_tracepoints) +{ + int err; + size_t i; + struct perf_event_attr *attrs = zalloc(nr_tracepoints * sizeof(*attrs)); + + if (attrs == NULL) + return -1; + + for (i = 0; i < nr_tracepoints; i++) { + err = trace_event__id(tracepoints[i]); + + if (err < 0) + goto out_free_attrs; + + attrs[i].type = PERF_TYPE_TRACEPOINT; + attrs[i].config = err; + attrs[i].sample_type = (PERF_SAMPLE_RAW | PERF_SAMPLE_TIME | + PERF_SAMPLE_CPU); + attrs[i].sample_period = 1; + } + + err = perf_evlist__add_attrs(evlist, attrs, nr_tracepoints); +out_free_attrs: + free(attrs); + return err; +} + +static struct perf_evsel * + perf_evlist__find_tracepoint_by_id(struct perf_evlist *evlist, int id) +{ + struct perf_evsel *evsel; + + list_for_each_entry(evsel, &evlist->entries, node) { + if (evsel->attr.type == PERF_TYPE_TRACEPOINT && + (int)evsel->attr.config == id) + return evsel; + } + + return NULL; +} + +int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, + const struct perf_evsel_str_handler *assocs, + size_t nr_assocs) +{ + struct perf_evsel *evsel; + int err; + size_t i; + + for (i = 0; i < nr_assocs; i++) { + err = trace_event__id(assocs[i].name); + if (err < 0) + goto out; + + evsel = perf_evlist__find_tracepoint_by_id(evlist, err); + if (evsel == NULL) + continue; + + err = -EEXIST; + if (evsel->handler.func != NULL) + goto out; + evsel->handler.func = assocs[i].handler; + } + + err = 0; +out: + return err; +} + void perf_evlist__disable(struct perf_evlist *evlist) { int cpu, thread; @@ -126,7 +275,7 @@ void perf_evlist__enable(struct perf_evlist *evlist) } } -int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) +static int perf_evlist__alloc_pollfd(struct perf_evlist *evlist) { int nfds = evlist->cpus->nr * evlist->threads->nr * evlist->nr_entries; evlist->pollfd = malloc(sizeof(struct pollfd) * nfds); @@ -282,7 +431,7 @@ void perf_evlist__munmap(struct perf_evlist *evlist) evlist->mmap = NULL; } -int perf_evlist__alloc_mmap(struct perf_evlist *evlist) +static int perf_evlist__alloc_mmap(struct perf_evlist *evlist) { evlist->nr_mmaps = evlist->cpus->nr; if (evlist->cpus->map[0] == -1) @@ -298,8 +447,10 @@ static int __perf_evlist__mmap(struct perf_evlist *evlist, evlist->mmap[idx].mask = mask; evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot, MAP_SHARED, fd, 0); - if (evlist->mmap[idx].base == MAP_FAILED) + if (evlist->mmap[idx].base == MAP_FAILED) { + evlist->mmap[idx].base = NULL; return -1; + } perf_evlist__add_pollfd(evlist, fd); return 0; @@ -400,14 +551,22 @@ out_unmap: * * Using perf_evlist__read_on_cpu does this automatically. */ -int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite) +int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, + bool overwrite) { unsigned int page_size = sysconf(_SC_PAGE_SIZE); - int mask = pages * page_size - 1; struct perf_evsel *evsel; const struct cpu_map *cpus = evlist->cpus; const struct thread_map *threads = evlist->threads; - int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE); + int prot = PROT_READ | (overwrite ? 0 : PROT_WRITE), mask; + + /* 512 kiB: default amount of unprivileged mlocked memory */ + if (pages == UINT_MAX) + pages = (512 * 1024) / page_size; + else if (!is_power_of_2(pages)) + return -EINVAL; + + mask = pages * page_size - 1; if (evlist->mmap == NULL && perf_evlist__alloc_mmap(evlist) < 0) return -ENOMEM; @@ -512,6 +671,38 @@ u64 perf_evlist__sample_type(const struct perf_evlist *evlist) return first->attr.sample_type; } +u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist) +{ + struct perf_evsel *first; + struct perf_sample *data; + u64 sample_type; + u16 size = 0; + + first = list_entry(evlist->entries.next, struct perf_evsel, node); + + if (!first->attr.sample_id_all) + goto out; + + sample_type = first->attr.sample_type; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid) * 2; + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu) * 2; +out: + return size; +} + bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist) { struct perf_evsel *pos, *first; @@ -569,3 +760,97 @@ out_err: return err; } + +int perf_evlist__prepare_workload(struct perf_evlist *evlist, + struct perf_record_opts *opts, + const char *argv[]) +{ + int child_ready_pipe[2], go_pipe[2]; + char bf; + + if (pipe(child_ready_pipe) < 0) { + perror("failed to create 'ready' pipe"); + return -1; + } + + if (pipe(go_pipe) < 0) { + perror("failed to create 'go' pipe"); + goto out_close_ready_pipe; + } + + evlist->workload.pid = fork(); + if (evlist->workload.pid < 0) { + perror("failed to fork"); + goto out_close_pipes; + } + + if (!evlist->workload.pid) { + if (opts->pipe_output) + dup2(2, 1); + + close(child_ready_pipe[0]); + close(go_pipe[1]); + fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC); + + /* + * Do a dummy execvp to get the PLT entry resolved, + * so we avoid the resolver overhead on the real + * execvp call. + */ + execvp("", (char **)argv); + + /* + * Tell the parent we're ready to go + */ + close(child_ready_pipe[1]); + + /* + * Wait until the parent tells us to go. + */ + if (read(go_pipe[0], &bf, 1) == -1) + perror("unable to read pipe"); + + execvp(argv[0], (char **)argv); + + perror(argv[0]); + kill(getppid(), SIGUSR1); + exit(-1); + } + + if (!opts->system_wide && opts->target_tid == -1 && opts->target_pid == -1) + evlist->threads->map[0] = evlist->workload.pid; + + close(child_ready_pipe[1]); + close(go_pipe[0]); + /* + * wait for child to settle + */ + if (read(child_ready_pipe[0], &bf, 1) == -1) { + perror("unable to read pipe"); + goto out_close_pipes; + } + + evlist->workload.cork_fd = go_pipe[1]; + close(child_ready_pipe[0]); + return 0; + +out_close_pipes: + close(go_pipe[0]); + close(go_pipe[1]); +out_close_ready_pipe: + close(child_ready_pipe[0]); + close(child_ready_pipe[1]); + return -1; +} + +int perf_evlist__start_workload(struct perf_evlist *evlist) +{ + if (evlist->workload.cork_fd > 0) { + /* + * Remove the cork, let it rip! + */ + return close(evlist->workload.cork_fd); + } + + return 0; +} diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 1779ffef782..8922aeed046 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -2,12 +2,16 @@ #define __PERF_EVLIST_H 1 #include <linux/list.h> +#include <stdio.h> #include "../perf.h" #include "event.h" +#include "util.h" +#include <unistd.h> struct pollfd; struct thread_map; struct cpu_map; +struct perf_record_opts; #define PERF_EVLIST__HLIST_BITS 8 #define PERF_EVLIST__HLIST_SIZE (1 << PERF_EVLIST__HLIST_BITS) @@ -19,6 +23,10 @@ struct perf_evlist { int nr_fds; int nr_mmaps; int mmap_len; + struct { + int cork_fd; + pid_t pid; + } workload; bool overwrite; union perf_event event_copy; struct perf_mmap *mmap; @@ -28,6 +36,11 @@ struct perf_evlist { struct perf_evsel *selected; }; +struct perf_evsel_str_handler { + const char *name; + void *handler; +}; + struct perf_evsel; struct perf_evlist *perf_evlist__new(struct cpu_map *cpus, @@ -39,11 +52,26 @@ void perf_evlist__delete(struct perf_evlist *evlist); void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry); int perf_evlist__add_default(struct perf_evlist *evlist); +int perf_evlist__add_attrs(struct perf_evlist *evlist, + struct perf_event_attr *attrs, size_t nr_attrs); +int perf_evlist__add_tracepoints(struct perf_evlist *evlist, + const char *tracepoints[], size_t nr_tracepoints); +int perf_evlist__set_tracepoints_handlers(struct perf_evlist *evlist, + const struct perf_evsel_str_handler *assocs, + size_t nr_assocs); + +#define perf_evlist__add_attrs_array(evlist, array) \ + perf_evlist__add_attrs(evlist, array, ARRAY_SIZE(array)) + +#define perf_evlist__add_tracepoints_array(evlist, array) \ + perf_evlist__add_tracepoints(evlist, array, ARRAY_SIZE(array)) + +#define perf_evlist__set_tracepoints_handlers_array(evlist, array) \ + perf_evlist__set_tracepoints_handlers(evlist, array, ARRAY_SIZE(array)) void perf_evlist__id_add(struct perf_evlist *evlist, struct perf_evsel *evsel, int cpu, int thread, u64 id); -int perf_evlist__alloc_pollfd(struct perf_evlist *evlist); void perf_evlist__add_pollfd(struct perf_evlist *evlist, int fd); struct perf_evsel *perf_evlist__id2evsel(struct perf_evlist *evlist, u64 id); @@ -52,8 +80,16 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *self, int idx); int perf_evlist__open(struct perf_evlist *evlist, bool group); -int perf_evlist__alloc_mmap(struct perf_evlist *evlist); -int perf_evlist__mmap(struct perf_evlist *evlist, int pages, bool overwrite); +void perf_evlist__config_attrs(struct perf_evlist *evlist, + struct perf_record_opts *opts); + +int perf_evlist__prepare_workload(struct perf_evlist *evlist, + struct perf_record_opts *opts, + const char *argv[]); +int perf_evlist__start_workload(struct perf_evlist *evlist); + +int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, + bool overwrite); void perf_evlist__munmap(struct perf_evlist *evlist); void perf_evlist__disable(struct perf_evlist *evlist); @@ -77,6 +113,7 @@ int perf_evlist__set_filters(struct perf_evlist *evlist); u64 perf_evlist__sample_type(const struct perf_evlist *evlist); bool perf_evlist__sample_id_all(const const struct perf_evlist *evlist); +u16 perf_evlist__id_hdr_size(const struct perf_evlist *evlist); bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist); bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist); diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d7915d4e77c..667f3b78bb2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -63,6 +63,79 @@ struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx) return evsel; } +void perf_evsel__config(struct perf_evsel *evsel, struct perf_record_opts *opts) +{ + struct perf_event_attr *attr = &evsel->attr; + int track = !evsel->idx; /* only the first counter needs these */ + + attr->sample_id_all = opts->sample_id_all_avail ? 1 : 0; + attr->inherit = !opts->no_inherit; + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING | + PERF_FORMAT_ID; + + attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID; + + /* + * We default some events to a 1 default interval. But keep + * it a weak assumption overridable by the user. + */ + if (!attr->sample_period || (opts->user_freq != UINT_MAX && + opts->user_interval != ULLONG_MAX)) { + if (opts->freq) { + attr->sample_type |= PERF_SAMPLE_PERIOD; + attr->freq = 1; + attr->sample_freq = opts->freq; + } else { + attr->sample_period = opts->default_interval; + } + } + + if (opts->no_samples) + attr->sample_freq = 0; + + if (opts->inherit_stat) + attr->inherit_stat = 1; + + if (opts->sample_address) { + attr->sample_type |= PERF_SAMPLE_ADDR; + attr->mmap_data = track; + } + + if (opts->call_graph) + attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + + if (opts->system_wide) + attr->sample_type |= PERF_SAMPLE_CPU; + + if (opts->period) + attr->sample_type |= PERF_SAMPLE_PERIOD; + + if (opts->sample_id_all_avail && + (opts->sample_time || opts->system_wide || + !opts->no_inherit || opts->cpu_list)) + attr->sample_type |= PERF_SAMPLE_TIME; + + if (opts->raw_samples) { + attr->sample_type |= PERF_SAMPLE_TIME; + attr->sample_type |= PERF_SAMPLE_RAW; + attr->sample_type |= PERF_SAMPLE_CPU; + } + + if (opts->no_delay) { + attr->watermark = 0; + attr->wakeup_events = 1; + } + + attr->mmap = track; + attr->comm = track; + + if (opts->target_pid == -1 && opts->target_tid == -1 && !opts->system_wide) { + attr->disabled = 1; + attr->enable_on_exec = 1; + } +} + int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads) { int cpu, thread; @@ -387,7 +460,7 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, u32 val32[2]; } u; - + memset(data, 0, sizeof(*data)); data->cpu = data->pid = data->tid = -1; data->stream_id = data->id = data->time = -1ULL; @@ -504,3 +577,82 @@ int perf_event__parse_sample(const union perf_event *event, u64 type, return 0; } + +int perf_event__synthesize_sample(union perf_event *event, u64 type, + const struct perf_sample *sample, + bool swapped) +{ + u64 *array; + + /* + * used for cross-endian analysis. See git commit 65014ab3 + * for why this goofiness is needed. + */ + union { + u64 val64; + u32 val32[2]; + } u; + + array = event->sample.array; + + if (type & PERF_SAMPLE_IP) { + event->ip.ip = sample->ip; + array++; + } + + if (type & PERF_SAMPLE_TID) { + u.val32[0] = sample->pid; + u.val32[1] = sample->tid; + if (swapped) { + /* + * Inverse of what is done in perf_event__parse_sample + */ + u.val32[0] = bswap_32(u.val32[0]); + u.val32[1] = bswap_32(u.val32[1]); + u.val64 = bswap_64(u.val64); + } + + *array = u.val64; + array++; + } + + if (type & PERF_SAMPLE_TIME) { + *array = sample->time; + array++; + } + + if (type & PERF_SAMPLE_ADDR) { + *array = sample->addr; + array++; + } + + if (type & PERF_SAMPLE_ID) { + *array = sample->id; + array++; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + *array = sample->stream_id; + array++; + } + + if (type & PERF_SAMPLE_CPU) { + u.val32[0] = sample->cpu; + if (swapped) { + /* + * Inverse of what is done in perf_event__parse_sample + */ + u.val32[0] = bswap_32(u.val32[0]); + u.val64 = bswap_64(u.val64); + } + *array = u.val64; + array++; + } + + if (type & PERF_SAMPLE_PERIOD) { + *array = sample->period; + array++; + } + + return 0; +} diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index b1d15e6f7ae..326b8e4d503 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -61,12 +61,17 @@ struct perf_evsel { off_t id_offset; }; struct cgroup_sel *cgrp; + struct { + void *func; + void *data; + } handler; bool supported; }; struct cpu_map; struct thread_map; struct perf_evlist; +struct perf_record_opts; struct perf_evsel *perf_evsel__new(struct perf_event_attr *attr, int idx); void perf_evsel__init(struct perf_evsel *evsel, @@ -74,6 +79,9 @@ void perf_evsel__init(struct perf_evsel *evsel, void perf_evsel__exit(struct perf_evsel *evsel); void perf_evsel__delete(struct perf_evsel *evsel); +void perf_evsel__config(struct perf_evsel *evsel, + struct perf_record_opts *opts); + int perf_evsel__alloc_fd(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_id(struct perf_evsel *evsel, int ncpus, int nthreads); int perf_evsel__alloc_counts(struct perf_evsel *evsel, int ncpus); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 33c17a2b2a8..3e7e0b09c12 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -8,6 +8,7 @@ #include <stdlib.h> #include <linux/list.h> #include <linux/kernel.h> +#include <linux/bitops.h> #include <sys/utsname.h> #include "evlist.h" @@ -28,9 +29,6 @@ static struct perf_trace_event_type *events; static u32 header_argc; static const char **header_argv; -static int dsos__write_buildid_table(struct perf_header *header, int fd); -static int perf_session__cache_build_ids(struct perf_session *session); - int perf_header__push_event(u64 id, const char *name) { if (strlen(name) > MAX_EVENT_NAME) @@ -187,6 +185,252 @@ perf_header__set_cmdline(int argc, const char **argv) return 0; } +#define dsos__for_each_with_build_id(pos, head) \ + list_for_each_entry(pos, head, node) \ + if (!pos->has_build_id) \ + continue; \ + else + +static int __dsos__write_buildid_table(struct list_head *head, pid_t pid, + u16 misc, int fd) +{ + struct dso *pos; + + dsos__for_each_with_build_id(pos, head) { + int err; + struct build_id_event b; + size_t len; + + if (!pos->hit) + continue; + len = pos->long_name_len + 1; + len = ALIGN(len, NAME_ALIGN); + memset(&b, 0, sizeof(b)); + memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); + b.pid = pid; + b.header.misc = misc; + b.header.size = sizeof(b) + len; + err = do_write(fd, &b, sizeof(b)); + if (err < 0) + return err; + err = write_padded(fd, pos->long_name, + pos->long_name_len + 1, len); + if (err < 0) + return err; + } + + return 0; +} + +static int machine__write_buildid_table(struct machine *machine, int fd) +{ + int err; + u16 kmisc = PERF_RECORD_MISC_KERNEL, + umisc = PERF_RECORD_MISC_USER; + + if (!machine__is_host(machine)) { + kmisc = PERF_RECORD_MISC_GUEST_KERNEL; + umisc = PERF_RECORD_MISC_GUEST_USER; + } + + err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid, + kmisc, fd); + if (err == 0) + err = __dsos__write_buildid_table(&machine->user_dsos, + machine->pid, umisc, fd); + return err; +} + +static int dsos__write_buildid_table(struct perf_header *header, int fd) +{ + struct perf_session *session = container_of(header, + struct perf_session, header); + struct rb_node *nd; + int err = machine__write_buildid_table(&session->host_machine, fd); + + if (err) + return err; + + for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + err = machine__write_buildid_table(pos, fd); + if (err) + break; + } + return err; +} + +int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, + const char *name, bool is_kallsyms) +{ + const size_t size = PATH_MAX; + char *realname, *filename = zalloc(size), + *linkname = zalloc(size), *targetname; + int len, err = -1; + + if (is_kallsyms) { + if (symbol_conf.kptr_restrict) { + pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n"); + return 0; + } + realname = (char *)name; + } else + realname = realpath(name, NULL); + + if (realname == NULL || filename == NULL || linkname == NULL) + goto out_free; + + len = snprintf(filename, size, "%s%s%s", + debugdir, is_kallsyms ? "/" : "", realname); + if (mkdir_p(filename, 0755)) + goto out_free; + + snprintf(filename + len, sizeof(filename) - len, "/%s", sbuild_id); + + if (access(filename, F_OK)) { + if (is_kallsyms) { + if (copyfile("/proc/kallsyms", filename)) + goto out_free; + } else if (link(realname, filename) && copyfile(name, filename)) + goto out_free; + } + + len = snprintf(linkname, size, "%s/.build-id/%.2s", + debugdir, sbuild_id); + + if (access(linkname, X_OK) && mkdir_p(linkname, 0755)) + goto out_free; + + snprintf(linkname + len, size - len, "/%s", sbuild_id + 2); + targetname = filename + strlen(debugdir) - 5; + memcpy(targetname, "../..", 5); + + if (symlink(targetname, linkname) == 0) + err = 0; +out_free: + if (!is_kallsyms) + free(realname); + free(filename); + free(linkname); + return err; +} + +static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size, + const char *name, const char *debugdir, + bool is_kallsyms) +{ + char sbuild_id[BUILD_ID_SIZE * 2 + 1]; + + build_id__sprintf(build_id, build_id_size, sbuild_id); + + return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms); +} + +int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir) +{ + const size_t size = PATH_MAX; + char *filename = zalloc(size), + *linkname = zalloc(size); + int err = -1; + + if (filename == NULL || linkname == NULL) + goto out_free; + + snprintf(linkname, size, "%s/.build-id/%.2s/%s", + debugdir, sbuild_id, sbuild_id + 2); + + if (access(linkname, F_OK)) + goto out_free; + + if (readlink(linkname, filename, size - 1) < 0) + goto out_free; + + if (unlink(linkname)) + goto out_free; + + /* + * Since the link is relative, we must make it absolute: + */ + snprintf(linkname, size, "%s/.build-id/%.2s/%s", + debugdir, sbuild_id, filename); + + if (unlink(linkname)) + goto out_free; + + err = 0; +out_free: + free(filename); + free(linkname); + return err; +} + +static int dso__cache_build_id(struct dso *dso, const char *debugdir) +{ + bool is_kallsyms = dso->kernel && dso->long_name[0] != '/'; + + return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), + dso->long_name, debugdir, is_kallsyms); +} + +static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir) +{ + struct dso *pos; + int err = 0; + + dsos__for_each_with_build_id(pos, head) + if (dso__cache_build_id(pos, debugdir)) + err = -1; + + return err; +} + +static int machine__cache_build_ids(struct machine *machine, const char *debugdir) +{ + int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir); + ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir); + return ret; +} + +static int perf_session__cache_build_ids(struct perf_session *session) +{ + struct rb_node *nd; + int ret; + char debugdir[PATH_MAX]; + + snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir); + + if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) + return -1; + + ret = machine__cache_build_ids(&session->host_machine, debugdir); + + for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + ret |= machine__cache_build_ids(pos, debugdir); + } + return ret ? -1 : 0; +} + +static bool machine__read_build_ids(struct machine *machine, bool with_hits) +{ + bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits); + ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits); + return ret; +} + +static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) +{ + struct rb_node *nd; + bool ret = machine__read_build_ids(&session->host_machine, with_hits); + + for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { + struct machine *pos = rb_entry(nd, struct machine, rb_node); + ret |= machine__read_build_ids(pos, with_hits); + } + + return ret; +} + static int write_trace_info(int fd, struct perf_header *h __used, struct perf_evlist *evlist) { @@ -202,6 +446,9 @@ static int write_build_id(int fd, struct perf_header *h, session = container_of(h, struct perf_session, header); + if (!perf_session__read_build_ids(session, true)) + return -1; + err = dsos__write_buildid_table(h, fd); if (err < 0) { pr_debug("failed to write buildid table\n"); @@ -1065,26 +1312,30 @@ struct feature_ops { bool full_only; }; -#define FEAT_OPA(n, w, p) \ - [n] = { .name = #n, .write = w, .print = p } -#define FEAT_OPF(n, w, p) \ - [n] = { .name = #n, .write = w, .print = p, .full_only = true } +#define FEAT_OPA(n, func) \ + [n] = { .name = #n, .write = write_##func, .print = print_##func } +#define FEAT_OPF(n, func) \ + [n] = { .name = #n, .write = write_##func, .print = print_##func, .full_only = true } + +/* feature_ops not implemented: */ +#define print_trace_info NULL +#define print_build_id NULL static const struct feature_ops feat_ops[HEADER_LAST_FEATURE] = { - FEAT_OPA(HEADER_TRACE_INFO, write_trace_info, NULL), - FEAT_OPA(HEADER_BUILD_ID, write_build_id, NULL), - FEAT_OPA(HEADER_HOSTNAME, write_hostname, print_hostname), - FEAT_OPA(HEADER_OSRELEASE, write_osrelease, print_osrelease), - FEAT_OPA(HEADER_VERSION, write_version, print_version), - FEAT_OPA(HEADER_ARCH, write_arch, print_arch), - FEAT_OPA(HEADER_NRCPUS, write_nrcpus, print_nrcpus), - FEAT_OPA(HEADER_CPUDESC, write_cpudesc, print_cpudesc), - FEAT_OPA(HEADER_CPUID, write_cpuid, print_cpuid), - FEAT_OPA(HEADER_TOTAL_MEM, write_total_mem, print_total_mem), - FEAT_OPA(HEADER_EVENT_DESC, write_event_desc, print_event_desc), - FEAT_OPA(HEADER_CMDLINE, write_cmdline, print_cmdline), - FEAT_OPF(HEADER_CPU_TOPOLOGY, write_cpu_topology, print_cpu_topology), - FEAT_OPF(HEADER_NUMA_TOPOLOGY, write_numa_topology, print_numa_topology), + FEAT_OPA(HEADER_TRACE_INFO, trace_info), + FEAT_OPA(HEADER_BUILD_ID, build_id), + FEAT_OPA(HEADER_HOSTNAME, hostname), + FEAT_OPA(HEADER_OSRELEASE, osrelease), + FEAT_OPA(HEADER_VERSION, version), + FEAT_OPA(HEADER_ARCH, arch), + FEAT_OPA(HEADER_NRCPUS, nrcpus), + FEAT_OPA(HEADER_CPUDESC, cpudesc), + FEAT_OPA(HEADER_CPUID, cpuid), + FEAT_OPA(HEADER_TOTAL_MEM, total_mem), + FEAT_OPA(HEADER_EVENT_DESC, event_desc), + FEAT_OPA(HEADER_CMDLINE, cmdline), + FEAT_OPF(HEADER_CPU_TOPOLOGY, cpu_topology), + FEAT_OPF(HEADER_NUMA_TOPOLOGY, numa_topology), }; struct header_print_data { @@ -1103,9 +1354,9 @@ static int perf_file_section__fprintf_info(struct perf_file_section *section, "%d, continuing...\n", section->offset, feat); return 0; } - if (feat < HEADER_TRACE_INFO || feat >= HEADER_LAST_FEATURE) { + if (feat >= HEADER_LAST_FEATURE) { pr_warning("unknown feature %d\n", feat); - return -1; + return 0; } if (!feat_ops[feat].print) return 0; @@ -1132,252 +1383,6 @@ int perf_header__fprintf_info(struct perf_session *session, FILE *fp, bool full) return 0; } -#define dsos__for_each_with_build_id(pos, head) \ - list_for_each_entry(pos, head, node) \ - if (!pos->has_build_id) \ - continue; \ - else - -static int __dsos__write_buildid_table(struct list_head *head, pid_t pid, - u16 misc, int fd) -{ - struct dso *pos; - - dsos__for_each_with_build_id(pos, head) { - int err; - struct build_id_event b; - size_t len; - - if (!pos->hit) - continue; - len = pos->long_name_len + 1; - len = ALIGN(len, NAME_ALIGN); - memset(&b, 0, sizeof(b)); - memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); - b.pid = pid; - b.header.misc = misc; - b.header.size = sizeof(b) + len; - err = do_write(fd, &b, sizeof(b)); - if (err < 0) - return err; - err = write_padded(fd, pos->long_name, - pos->long_name_len + 1, len); - if (err < 0) - return err; - } - - return 0; -} - -static int machine__write_buildid_table(struct machine *machine, int fd) -{ - int err; - u16 kmisc = PERF_RECORD_MISC_KERNEL, - umisc = PERF_RECORD_MISC_USER; - - if (!machine__is_host(machine)) { - kmisc = PERF_RECORD_MISC_GUEST_KERNEL; - umisc = PERF_RECORD_MISC_GUEST_USER; - } - - err = __dsos__write_buildid_table(&machine->kernel_dsos, machine->pid, - kmisc, fd); - if (err == 0) - err = __dsos__write_buildid_table(&machine->user_dsos, - machine->pid, umisc, fd); - return err; -} - -static int dsos__write_buildid_table(struct perf_header *header, int fd) -{ - struct perf_session *session = container_of(header, - struct perf_session, header); - struct rb_node *nd; - int err = machine__write_buildid_table(&session->host_machine, fd); - - if (err) - return err; - - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - err = machine__write_buildid_table(pos, fd); - if (err) - break; - } - return err; -} - -int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, - const char *name, bool is_kallsyms) -{ - const size_t size = PATH_MAX; - char *realname, *filename = zalloc(size), - *linkname = zalloc(size), *targetname; - int len, err = -1; - - if (is_kallsyms) { - if (symbol_conf.kptr_restrict) { - pr_debug("Not caching a kptr_restrict'ed /proc/kallsyms\n"); - return 0; - } - realname = (char *)name; - } else - realname = realpath(name, NULL); - - if (realname == NULL || filename == NULL || linkname == NULL) - goto out_free; - - len = snprintf(filename, size, "%s%s%s", - debugdir, is_kallsyms ? "/" : "", realname); - if (mkdir_p(filename, 0755)) - goto out_free; - - snprintf(filename + len, sizeof(filename) - len, "/%s", sbuild_id); - - if (access(filename, F_OK)) { - if (is_kallsyms) { - if (copyfile("/proc/kallsyms", filename)) - goto out_free; - } else if (link(realname, filename) && copyfile(name, filename)) - goto out_free; - } - - len = snprintf(linkname, size, "%s/.build-id/%.2s", - debugdir, sbuild_id); - - if (access(linkname, X_OK) && mkdir_p(linkname, 0755)) - goto out_free; - - snprintf(linkname + len, size - len, "/%s", sbuild_id + 2); - targetname = filename + strlen(debugdir) - 5; - memcpy(targetname, "../..", 5); - - if (symlink(targetname, linkname) == 0) - err = 0; -out_free: - if (!is_kallsyms) - free(realname); - free(filename); - free(linkname); - return err; -} - -static int build_id_cache__add_b(const u8 *build_id, size_t build_id_size, - const char *name, const char *debugdir, - bool is_kallsyms) -{ - char sbuild_id[BUILD_ID_SIZE * 2 + 1]; - - build_id__sprintf(build_id, build_id_size, sbuild_id); - - return build_id_cache__add_s(sbuild_id, debugdir, name, is_kallsyms); -} - -int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir) -{ - const size_t size = PATH_MAX; - char *filename = zalloc(size), - *linkname = zalloc(size); - int err = -1; - - if (filename == NULL || linkname == NULL) - goto out_free; - - snprintf(linkname, size, "%s/.build-id/%.2s/%s", - debugdir, sbuild_id, sbuild_id + 2); - - if (access(linkname, F_OK)) - goto out_free; - - if (readlink(linkname, filename, size - 1) < 0) - goto out_free; - - if (unlink(linkname)) - goto out_free; - - /* - * Since the link is relative, we must make it absolute: - */ - snprintf(linkname, size, "%s/.build-id/%.2s/%s", - debugdir, sbuild_id, filename); - - if (unlink(linkname)) - goto out_free; - - err = 0; -out_free: - free(filename); - free(linkname); - return err; -} - -static int dso__cache_build_id(struct dso *dso, const char *debugdir) -{ - bool is_kallsyms = dso->kernel && dso->long_name[0] != '/'; - - return build_id_cache__add_b(dso->build_id, sizeof(dso->build_id), - dso->long_name, debugdir, is_kallsyms); -} - -static int __dsos__cache_build_ids(struct list_head *head, const char *debugdir) -{ - struct dso *pos; - int err = 0; - - dsos__for_each_with_build_id(pos, head) - if (dso__cache_build_id(pos, debugdir)) - err = -1; - - return err; -} - -static int machine__cache_build_ids(struct machine *machine, const char *debugdir) -{ - int ret = __dsos__cache_build_ids(&machine->kernel_dsos, debugdir); - ret |= __dsos__cache_build_ids(&machine->user_dsos, debugdir); - return ret; -} - -static int perf_session__cache_build_ids(struct perf_session *session) -{ - struct rb_node *nd; - int ret; - char debugdir[PATH_MAX]; - - snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir); - - if (mkdir(debugdir, 0755) != 0 && errno != EEXIST) - return -1; - - ret = machine__cache_build_ids(&session->host_machine, debugdir); - - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret |= machine__cache_build_ids(pos, debugdir); - } - return ret ? -1 : 0; -} - -static bool machine__read_build_ids(struct machine *machine, bool with_hits) -{ - bool ret = __dsos__read_build_ids(&machine->kernel_dsos, with_hits); - ret |= __dsos__read_build_ids(&machine->user_dsos, with_hits); - return ret; -} - -static bool perf_session__read_build_ids(struct perf_session *session, bool with_hits) -{ - struct rb_node *nd; - bool ret = machine__read_build_ids(&session->host_machine, with_hits); - - for (nd = rb_first(&session->machines); nd; nd = rb_next(nd)) { - struct machine *pos = rb_entry(nd, struct machine, rb_node); - ret |= machine__read_build_ids(pos, with_hits); - } - - return ret; -} - static int do_write_feat(int fd, struct perf_header *h, int type, struct perf_file_section **p, struct perf_evlist *evlist) @@ -1386,6 +1391,8 @@ static int do_write_feat(int fd, struct perf_header *h, int type, int ret = 0; if (perf_header__has_feat(h, type)) { + if (!feat_ops[type].write) + return -1; (*p)->offset = lseek(fd, 0, SEEK_CUR); @@ -1408,18 +1415,12 @@ static int perf_header__adds_write(struct perf_header *header, struct perf_evlist *evlist, int fd) { int nr_sections; - struct perf_session *session; struct perf_file_section *feat_sec, *p; int sec_size; u64 sec_start; + int feat; int err; - session = container_of(header, struct perf_session, header); - - if (perf_header__has_feat(header, HEADER_BUILD_ID && - !perf_session__read_build_ids(session, true))) - perf_header__clear_feat(header, HEADER_BUILD_ID); - nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); if (!nr_sections) return 0; @@ -1433,64 +1434,11 @@ static int perf_header__adds_write(struct perf_header *header, sec_start = header->data_offset + header->data_size; lseek(fd, sec_start + sec_size, SEEK_SET); - err = do_write_feat(fd, header, HEADER_TRACE_INFO, &p, evlist); - if (err) - goto out_free; - - err = do_write_feat(fd, header, HEADER_BUILD_ID, &p, evlist); - if (err) { - perf_header__clear_feat(header, HEADER_BUILD_ID); - goto out_free; + for_each_set_bit(feat, header->adds_features, HEADER_FEAT_BITS) { + if (do_write_feat(fd, header, feat, &p, evlist)) + perf_header__clear_feat(header, feat); } - err = do_write_feat(fd, header, HEADER_HOSTNAME, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_HOSTNAME); - - err = do_write_feat(fd, header, HEADER_OSRELEASE, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_OSRELEASE); - - err = do_write_feat(fd, header, HEADER_VERSION, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_VERSION); - - err = do_write_feat(fd, header, HEADER_ARCH, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_ARCH); - - err = do_write_feat(fd, header, HEADER_NRCPUS, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_NRCPUS); - - err = do_write_feat(fd, header, HEADER_CPUDESC, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_CPUDESC); - - err = do_write_feat(fd, header, HEADER_CPUID, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_CPUID); - - err = do_write_feat(fd, header, HEADER_TOTAL_MEM, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_TOTAL_MEM); - - err = do_write_feat(fd, header, HEADER_CMDLINE, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_CMDLINE); - - err = do_write_feat(fd, header, HEADER_EVENT_DESC, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_EVENT_DESC); - - err = do_write_feat(fd, header, HEADER_CPU_TOPOLOGY, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_CPU_TOPOLOGY); - - err = do_write_feat(fd, header, HEADER_NUMA_TOPOLOGY, &p, evlist); - if (err) - perf_header__clear_feat(header, HEADER_NUMA_TOPOLOGY); - lseek(fd, sec_start, SEEK_SET); /* * may write more than needed due to dropped feature, but @@ -1499,7 +1447,6 @@ static int perf_header__adds_write(struct perf_header *header, err = do_write(fd, feat_sec, sec_size); if (err < 0) pr_debug("failed to write feature section\n"); -out_free: free(feat_sec); return err; } @@ -1637,20 +1584,20 @@ static int perf_header__getbuffer64(struct perf_header *header, int perf_header__process_sections(struct perf_header *header, int fd, void *data, int (*process)(struct perf_file_section *section, - struct perf_header *ph, - int feat, int fd, void *data)) + struct perf_header *ph, + int feat, int fd, void *data)) { - struct perf_file_section *feat_sec; + struct perf_file_section *feat_sec, *sec; int nr_sections; int sec_size; - int idx = 0; - int err = -1, feat = 1; + int feat; + int err; nr_sections = bitmap_weight(header->adds_features, HEADER_FEAT_BITS); if (!nr_sections) return 0; - feat_sec = calloc(sizeof(*feat_sec), nr_sections); + feat_sec = sec = calloc(sizeof(*feat_sec), nr_sections); if (!feat_sec) return -1; @@ -1658,20 +1605,16 @@ int perf_header__process_sections(struct perf_header *header, int fd, lseek(fd, header->data_offset + header->data_size, SEEK_SET); - if (perf_header__getbuffer64(header, fd, feat_sec, sec_size)) + err = perf_header__getbuffer64(header, fd, feat_sec, sec_size); + if (err < 0) goto out_free; - err = 0; - while (idx < nr_sections && feat < HEADER_LAST_FEATURE) { - if (perf_header__has_feat(header, feat)) { - struct perf_file_section *sec = &feat_sec[idx++]; - - err = process(sec, header, feat, fd, data); - if (err < 0) - break; - } - ++feat; + for_each_set_bit(feat, header->adds_features, HEADER_LAST_FEATURE) { + err = process(sec++, header, feat, fd, data); + if (err < 0) + goto out_free; } + err = 0; out_free: free(feat_sec); return err; @@ -1906,32 +1849,21 @@ static int perf_file_section__process(struct perf_file_section *section, return 0; } + if (feat >= HEADER_LAST_FEATURE) { + pr_debug("unknown feature %d, continuing...\n", feat); + return 0; + } + switch (feat) { case HEADER_TRACE_INFO: trace_report(fd, false); break; - case HEADER_BUILD_ID: if (perf_header__read_build_ids(ph, fd, section->offset, section->size)) pr_debug("Failed to read buildids, continuing...\n"); break; - - case HEADER_HOSTNAME: - case HEADER_OSRELEASE: - case HEADER_VERSION: - case HEADER_ARCH: - case HEADER_NRCPUS: - case HEADER_CPUDESC: - case HEADER_CPUID: - case HEADER_TOTAL_MEM: - case HEADER_CMDLINE: - case HEADER_EVENT_DESC: - case HEADER_CPU_TOPOLOGY: - case HEADER_NUMA_TOPOLOGY: - break; - default: - pr_debug("unknown feature %d, continuing...\n", feat); + break; } return 0; @@ -2041,6 +1973,8 @@ int perf_session__read_header(struct perf_session *session, int fd) lseek(fd, tmp, SEEK_SET); } + symbol_conf.nr_events = nr_attrs; + if (f_header.event_types.size) { lseek(fd, f_header.event_types.offset, SEEK_SET); events = malloc(f_header.event_types.size); @@ -2068,9 +2002,9 @@ out_delete_evlist: return -ENOMEM; } -int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, - perf_event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_attr(struct perf_tool *tool, + struct perf_event_attr *attr, u16 ids, u64 *id, + perf_event__handler_t process) { union perf_event *ev; size_t size; @@ -2092,22 +2026,23 @@ int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, ev->attr.header.type = PERF_RECORD_HEADER_ATTR; ev->attr.header.size = size; - err = process(ev, NULL, session); + err = process(tool, ev, NULL, NULL); free(ev); return err; } -int perf_session__synthesize_attrs(struct perf_session *session, +int perf_event__synthesize_attrs(struct perf_tool *tool, + struct perf_session *session, perf_event__handler_t process) { struct perf_evsel *attr; int err = 0; list_for_each_entry(attr, &session->evlist->entries, node) { - err = perf_event__synthesize_attr(&attr->attr, attr->ids, - attr->id, process, session); + err = perf_event__synthesize_attr(tool, &attr->attr, attr->ids, + attr->id, process); if (err) { pr_debug("failed to create perf header attribute\n"); return err; @@ -2118,23 +2053,23 @@ int perf_session__synthesize_attrs(struct perf_session *session, } int perf_event__process_attr(union perf_event *event, - struct perf_session *session) + struct perf_evlist **pevlist) { unsigned int i, ids, n_ids; struct perf_evsel *evsel; + struct perf_evlist *evlist = *pevlist; - if (session->evlist == NULL) { - session->evlist = perf_evlist__new(NULL, NULL); - if (session->evlist == NULL) + if (evlist == NULL) { + *pevlist = evlist = perf_evlist__new(NULL, NULL); + if (evlist == NULL) return -ENOMEM; } - evsel = perf_evsel__new(&event->attr.attr, - session->evlist->nr_entries); + evsel = perf_evsel__new(&event->attr.attr, evlist->nr_entries); if (evsel == NULL) return -ENOMEM; - perf_evlist__add(session->evlist, evsel); + perf_evlist__add(evlist, evsel); ids = event->header.size; ids -= (void *)&event->attr.id - (void *)event; @@ -2148,18 +2083,16 @@ int perf_event__process_attr(union perf_event *event, return -ENOMEM; for (i = 0; i < n_ids; i++) { - perf_evlist__id_add(session->evlist, evsel, 0, i, - event->attr.id[i]); + perf_evlist__id_add(evlist, evsel, 0, i, event->attr.id[i]); } - perf_session__update_sample_type(session); - return 0; } -int perf_event__synthesize_event_type(u64 event_id, char *name, +int perf_event__synthesize_event_type(struct perf_tool *tool, + u64 event_id, char *name, perf_event__handler_t process, - struct perf_session *session) + struct machine *machine) { union perf_event ev; size_t size = 0; @@ -2177,13 +2110,14 @@ int perf_event__synthesize_event_type(u64 event_id, char *name, ev.event_type.header.size = sizeof(ev.event_type) - (sizeof(ev.event_type.event_type.name) - size); - err = process(&ev, NULL, session); + err = process(tool, &ev, NULL, machine); return err; } -int perf_event__synthesize_event_types(perf_event__handler_t process, - struct perf_session *session) +int perf_event__synthesize_event_types(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine) { struct perf_trace_event_type *type; int i, err = 0; @@ -2191,9 +2125,9 @@ int perf_event__synthesize_event_types(perf_event__handler_t process, for (i = 0; i < event_count; i++) { type = &events[i]; - err = perf_event__synthesize_event_type(type->event_id, + err = perf_event__synthesize_event_type(tool, type->event_id, type->name, process, - session); + machine); if (err) { pr_debug("failed to create perf header event type\n"); return err; @@ -2203,8 +2137,8 @@ int perf_event__synthesize_event_types(perf_event__handler_t process, return err; } -int perf_event__process_event_type(union perf_event *event, - struct perf_session *session __unused) +int perf_event__process_event_type(struct perf_tool *tool __unused, + union perf_event *event) { if (perf_header__push_event(event->event_type.event_type.event_id, event->event_type.event_type.name) < 0) @@ -2213,9 +2147,9 @@ int perf_event__process_event_type(union perf_event *event, return 0; } -int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, - perf_event__handler_t process, - struct perf_session *session __unused) +int perf_event__synthesize_tracing_data(struct perf_tool *tool, int fd, + struct perf_evlist *evlist, + perf_event__handler_t process) { union perf_event ev; struct tracing_data *tdata; @@ -2246,7 +2180,7 @@ int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, ev.tracing_data.header.size = sizeof(ev.tracing_data); ev.tracing_data.size = aligned_size; - process(&ev, NULL, session); + process(tool, &ev, NULL, NULL); /* * The put function will copy all the tracing data @@ -2288,10 +2222,10 @@ int perf_event__process_tracing_data(union perf_event *event, return size_read + padding; } -int perf_event__synthesize_build_id(struct dso *pos, u16 misc, +int perf_event__synthesize_build_id(struct perf_tool *tool, + struct dso *pos, u16 misc, perf_event__handler_t process, - struct machine *machine, - struct perf_session *session) + struct machine *machine) { union perf_event ev; size_t len; @@ -2311,12 +2245,13 @@ int perf_event__synthesize_build_id(struct dso *pos, u16 misc, ev.build_id.header.size = sizeof(ev.build_id) + len; memcpy(&ev.build_id.filename, pos->long_name, pos->long_name_len); - err = process(&ev, NULL, session); + err = process(tool, &ev, NULL, machine); return err; } -int perf_event__process_build_id(union perf_event *event, +int perf_event__process_build_id(struct perf_tool *tool __used, + union perf_event *event, struct perf_session *session) { __event_process_build_id(&event->build_id, diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 3d5a742f4a2..ac4ec956024 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -10,7 +10,8 @@ #include <linux/bitmap.h> enum { - HEADER_TRACE_INFO = 1, + HEADER_RESERVED = 0, /* always cleared */ + HEADER_TRACE_INFO = 1, HEADER_BUILD_ID, HEADER_HOSTNAME, @@ -27,10 +28,9 @@ enum { HEADER_NUMA_TOPOLOGY, HEADER_LAST_FEATURE, + HEADER_FEAT_BITS = 256, }; -#define HEADER_FEAT_BITS 256 - struct perf_file_section { u64 offset; u64 size; @@ -68,6 +68,7 @@ struct perf_header { }; struct perf_evlist; +struct perf_session; int perf_session__read_header(struct perf_session *session, int fd); int perf_session__write_header(struct perf_session *session, @@ -96,32 +97,36 @@ int build_id_cache__add_s(const char *sbuild_id, const char *debugdir, const char *name, bool is_kallsyms); int build_id_cache__remove_s(const char *sbuild_id, const char *debugdir); -int perf_event__synthesize_attr(struct perf_event_attr *attr, u16 ids, u64 *id, - perf_event__handler_t process, - struct perf_session *session); -int perf_session__synthesize_attrs(struct perf_session *session, - perf_event__handler_t process); -int perf_event__process_attr(union perf_event *event, struct perf_session *session); +int perf_event__synthesize_attr(struct perf_tool *tool, + struct perf_event_attr *attr, u16 ids, u64 *id, + perf_event__handler_t process); +int perf_event__synthesize_attrs(struct perf_tool *tool, + struct perf_session *session, + perf_event__handler_t process); +int perf_event__process_attr(union perf_event *event, struct perf_evlist **pevlist); -int perf_event__synthesize_event_type(u64 event_id, char *name, +int perf_event__synthesize_event_type(struct perf_tool *tool, + u64 event_id, char *name, perf_event__handler_t process, - struct perf_session *session); -int perf_event__synthesize_event_types(perf_event__handler_t process, - struct perf_session *session); -int perf_event__process_event_type(union perf_event *event, - struct perf_session *session); - -int perf_event__synthesize_tracing_data(int fd, struct perf_evlist *evlist, - perf_event__handler_t process, - struct perf_session *session); + struct machine *machine); +int perf_event__synthesize_event_types(struct perf_tool *tool, + perf_event__handler_t process, + struct machine *machine); +int perf_event__process_event_type(struct perf_tool *tool, + union perf_event *event); + +int perf_event__synthesize_tracing_data(struct perf_tool *tool, + int fd, struct perf_evlist *evlist, + perf_event__handler_t process); int perf_event__process_tracing_data(union perf_event *event, struct perf_session *session); -int perf_event__synthesize_build_id(struct dso *pos, u16 misc, +int perf_event__synthesize_build_id(struct perf_tool *tool, + struct dso *pos, u16 misc, perf_event__handler_t process, - struct machine *machine, - struct perf_session *session); -int perf_event__process_build_id(union perf_event *event, + struct machine *machine); +int perf_event__process_build_id(struct perf_tool *tool, + union perf_event *event, struct perf_session *session); /* diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 89289c8e935..ff6f9d56ea4 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -117,7 +117,6 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __used, static inline int hist_entry__tui_annotate(struct hist_entry *self __used, int evidx __used, - int nr_events __used, void(*timer)(void *arg) __used, void *arg __used, int delay_secs __used) @@ -128,7 +127,7 @@ static inline int hist_entry__tui_annotate(struct hist_entry *self __used, #define K_RIGHT -2 #else #include "ui/keysyms.h" -int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events, +int hist_entry__tui_annotate(struct hist_entry *he, int evidx, void(*timer)(void *arg), void *arg, int delay_secs); int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index 305c8484f20..62cdee78db7 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -9,6 +9,17 @@ #define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) +#define for_each_set_bit(bit, addr, size) \ + for ((bit) = find_first_bit((addr), (size)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + +/* same as for_each_set_bit() but use bit as value to start with */ +#define for_each_set_bit_cont(bit, addr, size) \ + for ((bit) = find_next_bit((addr), (size), (bit)); \ + (bit) < (size); \ + (bit) = find_next_bit((addr), (size), (bit) + 1)) + static inline void set_bit(int nr, unsigned long *addr) { addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG); @@ -30,4 +41,111 @@ static inline unsigned long hweight_long(unsigned long w) return sizeof(w) == 4 ? hweight32(w) : hweight64(w); } +#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) + +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __always_inline unsigned long __ffs(unsigned long word) +{ + int num = 0; + +#if BITS_PER_LONG == 64 + if ((word & 0xffffffff) == 0) { + num += 32; + word >>= 32; + } +#endif + if ((word & 0xffff) == 0) { + num += 16; + word >>= 16; + } + if ((word & 0xff) == 0) { + num += 8; + word >>= 8; + } + if ((word & 0xf) == 0) { + num += 4; + word >>= 4; + } + if ((word & 0x3) == 0) { + num += 2; + word >>= 2; + } + if ((word & 0x1) == 0) + num += 1; + return num; +} + +/* + * Find the first set bit in a memory region. + */ +static inline unsigned long +find_first_bit(const unsigned long *addr, unsigned long size) +{ + const unsigned long *p = addr; + unsigned long result = 0; + unsigned long tmp; + + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + + tmp = (*p) & (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found: + return result + __ffs(tmp); +} + +/* + * Find the next set bit in a memory region. + */ +static inline unsigned long +find_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) +{ + const unsigned long *p = addr + BITOP_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) + goto found_first; + if (tmp) + goto found_middle; + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if ((tmp = *(p++))) + goto found_middle; + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + #endif diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 78284b13e80..316aa0ab712 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -562,6 +562,10 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid) INIT_LIST_HEAD(&self->user_dsos); INIT_LIST_HEAD(&self->kernel_dsos); + self->threads = RB_ROOT; + INIT_LIST_HEAD(&self->dead_threads); + self->last_match = NULL; + self->kmaps.machine = self; self->pid = pid; self->root_dir = strdup(root_dir); diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 890d85545d0..2b8017f8a93 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -18,9 +18,11 @@ enum map_type { extern const char *map_type__name[MAP__NR_TYPES]; struct dso; +struct ip_callchain; struct ref_reloc_sym; struct map_groups; struct machine; +struct perf_evsel; struct map { union { @@ -61,7 +63,11 @@ struct map_groups { struct machine { struct rb_node rb_node; pid_t pid; + u16 id_hdr_size; char *root_dir; + struct rb_root threads; + struct list_head dead_threads; + struct thread *last_match; struct list_head user_dsos; struct list_head kernel_dsos; struct map_groups kmaps; @@ -148,6 +154,13 @@ int machine__init(struct machine *self, const char *root_dir, pid_t pid); void machine__exit(struct machine *self); void machine__delete(struct machine *self); +int machine__resolve_callchain(struct machine *machine, + struct perf_evsel *evsel, struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent); +int maps__set_kallsyms_ref_reloc_sym(struct map **maps, const char *symbol_name, + u64 addr); + /* * Default guest kernel is defined by parameter --guestkallsyms * and --guestmodules @@ -190,6 +203,12 @@ struct symbol *map_groups__find_symbol_by_name(struct map_groups *mg, struct map **mapp, symbol_filter_t filter); + +struct thread *machine__findnew_thread(struct machine *machine, pid_t pid); +void machine__remove_thread(struct machine *machine, struct thread *th); + +size_t machine__fprintf(struct machine *machine, FILE *fp); + static inline struct symbol *machine__find_kernel_symbol(struct machine *self, enum map_type type, u64 addr, diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 928918b796b..531c283fc0c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -25,8 +25,6 @@ enum event_result { EVT_HANDLED_ALL }; -char debugfs_path[MAXPATHLEN]; - #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x @@ -40,6 +38,7 @@ static struct event_symbol event_symbols[] = { { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, + { CHW(REF_CPU_CYCLES), "ref-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, @@ -70,6 +69,7 @@ static const char *hw_event_names[PERF_COUNT_HW_MAX] = { "bus-cycles", "stalled-cycles-frontend", "stalled-cycles-backend", + "ref-cycles", }; static const char *sw_event_names[PERF_COUNT_SW_MAX] = { @@ -140,7 +140,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) char evt_path[MAXPATHLEN]; int fd; - snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, + snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path, sys_dir->d_name, evt_dir->d_name); fd = open(evt_path, O_RDONLY); if (fd < 0) @@ -171,16 +171,16 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; - if (debugfs_valid_mountpoint(debugfs_path)) + if (debugfs_valid_mountpoint(tracing_events_path)) return NULL; - sys_dir = opendir(debugfs_path); + sys_dir = opendir(tracing_events_path); if (!sys_dir) return NULL; for_each_subsystem(sys_dir, sys_dirent, sys_next) { - snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, + snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_dirent.d_name); evt_dir = opendir(dir_path); if (!evt_dir) @@ -447,7 +447,7 @@ parse_single_tracepoint_event(char *sys_name, u64 id; int fd; - snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, + snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", tracing_events_path, sys_name, evt_name); fd = open(evt_path, O_RDONLY); @@ -485,7 +485,7 @@ parse_multiple_tracepoint_event(struct perf_evlist *evlist, char *sys_name, struct dirent *evt_ent; DIR *evt_dir; - snprintf(evt_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_name); + snprintf(evt_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_name); evt_dir = opendir(evt_path); if (!evt_dir) { @@ -528,7 +528,7 @@ parse_tracepoint_event(struct perf_evlist *evlist, const char **strp, char sys_name[MAX_EVENT_LENGTH]; unsigned int sys_length, evt_length; - if (debugfs_valid_mountpoint(debugfs_path)) + if (debugfs_valid_mountpoint(tracing_events_path)) return 0; evt_name = strchr(*strp, ':'); @@ -920,10 +920,10 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob) char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; - if (debugfs_valid_mountpoint(debugfs_path)) + if (debugfs_valid_mountpoint(tracing_events_path)) return; - sys_dir = opendir(debugfs_path); + sys_dir = opendir(tracing_events_path); if (!sys_dir) return; @@ -932,7 +932,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob) !strglobmatch(sys_dirent.d_name, subsys_glob)) continue; - snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, + snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_dirent.d_name); evt_dir = opendir(dir_path); if (!evt_dir) @@ -964,16 +964,16 @@ int is_valid_tracepoint(const char *event_string) char evt_path[MAXPATHLEN]; char dir_path[MAXPATHLEN]; - if (debugfs_valid_mountpoint(debugfs_path)) + if (debugfs_valid_mountpoint(tracing_events_path)) return 0; - sys_dir = opendir(debugfs_path); + sys_dir = opendir(tracing_events_path); if (!sys_dir) return 0; for_each_subsystem(sys_dir, sys_dirent, sys_next) { - snprintf(dir_path, MAXPATHLEN, "%s/%s", debugfs_path, + snprintf(dir_path, MAXPATHLEN, "%s/%s", tracing_events_path, sys_dirent.d_name); evt_dir = opendir(dir_path); if (!evt_dir) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 2f8e375e038..7e0cbe75d5f 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -39,7 +39,6 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob); int print_hwcache_events(const char *event_glob); extern int is_valid_tracepoint(const char *event_string); -extern char debugfs_path[]; extern int valid_debugfs_mount(const char *debugfs); #endif /* __PERF_PARSE_EVENTS_H */ diff --git a/tools/perf/util/probe-finder.h b/tools/perf/util/probe-finder.h index 1132c8f0ce8..17e94d0c36f 100644 --- a/tools/perf/util/probe-finder.h +++ b/tools/perf/util/probe-finder.h @@ -5,7 +5,6 @@ #include "util.h" #include "probe-event.h" -#define MAX_PATH_LEN 256 #define MAX_PROBE_BUFFER 1024 #define MAX_PROBES 128 diff --git a/tools/perf/util/scripting-engines/trace-event-perl.c b/tools/perf/util/scripting-engines/trace-event-perl.c index 74350ffb57f..e30749e38a9 100644 --- a/tools/perf/util/scripting-engines/trace-event-perl.c +++ b/tools/perf/util/scripting-engines/trace-event-perl.c @@ -27,7 +27,10 @@ #include "../../perf.h" #include "../util.h" +#include "../thread.h" +#include "../event.h" #include "../trace-event.h" +#include "../evsel.h" #include <EXTERN.h> #include <perl.h> @@ -245,11 +248,11 @@ static inline struct event *find_cache_event(int type) return event; } -static void perl_process_event(union perf_event *pevent __unused, - struct perf_sample *sample, - struct perf_evsel *evsel, - struct perf_session *session __unused, - struct thread *thread) +static void perl_process_tracepoint(union perf_event *pevent __unused, + struct perf_sample *sample, + struct perf_evsel *evsel, + struct machine *machine __unused, + struct thread *thread) { struct format_field *field; static char handler[256]; @@ -265,6 +268,9 @@ static void perl_process_event(union perf_event *pevent __unused, dSP; + if (evsel->attr.type != PERF_TYPE_TRACEPOINT) + return; + type = trace_parse_common_type(data); event = find_cache_event(type); @@ -332,6 +338,42 @@ static void perl_process_event(union perf_event *pevent __unused, LEAVE; } +static void perl_process_event_generic(union perf_event *pevent __unused, + struct perf_sample *sample, + struct perf_evsel *evsel __unused, + struct machine *machine __unused, + struct thread *thread __unused) +{ + dSP; + + if (!get_cv("process_event", 0)) + return; + + ENTER; + SAVETMPS; + PUSHMARK(SP); + XPUSHs(sv_2mortal(newSVpvn((const char *)pevent, pevent->header.size))); + XPUSHs(sv_2mortal(newSVpvn((const char *)&evsel->attr, sizeof(evsel->attr)))); + XPUSHs(sv_2mortal(newSVpvn((const char *)sample, sizeof(*sample)))); + XPUSHs(sv_2mortal(newSVpvn((const char *)sample->raw_data, sample->raw_size))); + PUTBACK; + call_pv("process_event", G_SCALAR); + SPAGAIN; + PUTBACK; + FREETMPS; + LEAVE; +} + +static void perl_process_event(union perf_event *pevent, + struct perf_sample *sample, + struct perf_evsel *evsel, + struct machine *machine, + struct thread *thread) +{ + perl_process_tracepoint(pevent, sample, evsel, machine, thread); + perl_process_event_generic(pevent, sample, evsel, machine, thread); +} + static void run_start_sub(void) { dSP; /* access to Perl stack */ @@ -553,7 +595,28 @@ static int perl_generate_script(const char *outfile) fprintf(ofp, "sub print_header\n{\n" "\tmy ($event_name, $cpu, $secs, $nsecs, $pid, $comm) = @_;\n\n" "\tprintf(\"%%-20s %%5u %%05u.%%09u %%8u %%-20s \",\n\t " - "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}"); + "$event_name, $cpu, $secs, $nsecs, $pid, $comm);\n}\n"); + + fprintf(ofp, + "\n# Packed byte string args of process_event():\n" + "#\n" + "# $event:\tunion perf_event\tutil/event.h\n" + "# $attr:\tstruct perf_event_attr\tlinux/perf_event.h\n" + "# $sample:\tstruct perf_sample\tutil/event.h\n" + "# $raw_data:\tperf_sample->raw_data\tutil/event.h\n" + "\n" + "sub process_event\n" + "{\n" + "\tmy ($event, $attr, $sample, $raw_data) = @_;\n" + "\n" + "\tmy @event\t= unpack(\"LSS\", $event);\n" + "\tmy @attr\t= unpack(\"LLQQQQQLLQQ\", $attr);\n" + "\tmy @sample\t= unpack(\"QLLQQQQQLL\", $sample);\n" + "\tmy @raw_data\t= unpack(\"C*\", $raw_data);\n" + "\n" + "\tuse Data::Dumper;\n" + "\tprint Dumper \\@event, \\@attr, \\@sample, \\@raw_data;\n" + "}\n"); fclose(ofp); diff --git a/tools/perf/util/scripting-engines/trace-event-python.c b/tools/perf/util/scripting-engines/trace-event-python.c index 6ccf70e8d8f..0b2a4878317 100644 --- a/tools/perf/util/scripting-engines/trace-event-python.c +++ b/tools/perf/util/scripting-engines/trace-event-python.c @@ -29,6 +29,8 @@ #include "../../perf.h" #include "../util.h" +#include "../event.h" +#include "../thread.h" #include "../trace-event.h" PyMODINIT_FUNC initperf_trace_context(void); @@ -207,7 +209,7 @@ static inline struct event *find_cache_event(int type) static void python_process_event(union perf_event *pevent __unused, struct perf_sample *sample, struct perf_evsel *evsel __unused, - struct perf_session *session __unused, + struct machine *machine __unused, struct thread *thread) { PyObject *handler, *retval, *context, *t, *obj, *dict = NULL; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 0f4555ce906..b5ca2558c7b 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -10,6 +10,7 @@ #include "evlist.h" #include "evsel.h" #include "session.h" +#include "tool.h" #include "sort.h" #include "util.h" #include "cpumap.h" @@ -78,39 +79,13 @@ out_close: return -1; } -static void perf_session__id_header_size(struct perf_session *session) -{ - struct perf_sample *data; - u64 sample_type = session->sample_type; - u16 size = 0; - - if (!session->sample_id_all) - goto out; - - if (sample_type & PERF_SAMPLE_TID) - size += sizeof(data->tid) * 2; - - if (sample_type & PERF_SAMPLE_TIME) - size += sizeof(data->time); - - if (sample_type & PERF_SAMPLE_ID) - size += sizeof(data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - size += sizeof(data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - size += sizeof(data->cpu) * 2; -out: - session->id_hdr_size = size; -} - void perf_session__update_sample_type(struct perf_session *self) { self->sample_type = perf_evlist__sample_type(self->evlist); self->sample_size = __perf_evsel__sample_size(self->sample_type); self->sample_id_all = perf_evlist__sample_id_all(self->evlist); - perf_session__id_header_size(self); + self->id_hdr_size = perf_evlist__id_hdr_size(self->evlist); + self->host_machine.id_hdr_size = self->id_hdr_size; } int perf_session__create_kernel_maps(struct perf_session *self) @@ -130,18 +105,26 @@ static void perf_session__destroy_kernel_maps(struct perf_session *self) struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe, - struct perf_event_ops *ops) + struct perf_tool *tool) { - size_t len = filename ? strlen(filename) + 1 : 0; - struct perf_session *self = zalloc(sizeof(*self) + len); + struct perf_session *self; + struct stat st; + size_t len; + + if (!filename || !strlen(filename)) { + if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode)) + filename = "-"; + else + filename = "perf.data"; + } + + len = strlen(filename); + self = zalloc(sizeof(*self) + len); if (self == NULL) goto out; memcpy(self->filename, filename, len); - self->threads = RB_ROOT; - INIT_LIST_HEAD(&self->dead_threads); - self->last_match = NULL; /* * On 64bit we can mmap the data file in one go. No need for tiny mmap * slices. On 32bit we use 32MB. @@ -171,10 +154,10 @@ struct perf_session *perf_session__new(const char *filename, int mode, goto out_delete; } - if (ops && ops->ordering_requires_timestamps && - ops->ordered_samples && !self->sample_id_all) { + if (tool && tool->ordering_requires_timestamps && + tool->ordered_samples && !self->sample_id_all) { dump_printf("WARNING: No sample_id_all support, falling back to unordered processing\n"); - ops->ordered_samples = false; + tool->ordered_samples = false; } out: @@ -184,17 +167,22 @@ out_delete: return NULL; } -static void perf_session__delete_dead_threads(struct perf_session *self) +static void machine__delete_dead_threads(struct machine *machine) { struct thread *n, *t; - list_for_each_entry_safe(t, n, &self->dead_threads, node) { + list_for_each_entry_safe(t, n, &machine->dead_threads, node) { list_del(&t->node); thread__delete(t); } } -static void perf_session__delete_threads(struct perf_session *self) +static void perf_session__delete_dead_threads(struct perf_session *session) +{ + machine__delete_dead_threads(&session->host_machine); +} + +static void machine__delete_threads(struct machine *self) { struct rb_node *nd = rb_first(&self->threads); @@ -207,6 +195,11 @@ static void perf_session__delete_threads(struct perf_session *self) } } +static void perf_session__delete_threads(struct perf_session *session) +{ + machine__delete_threads(&session->host_machine); +} + void perf_session__delete(struct perf_session *self) { perf_session__destroy_kernel_maps(self); @@ -217,7 +210,7 @@ void perf_session__delete(struct perf_session *self) free(self); } -void perf_session__remove_thread(struct perf_session *self, struct thread *th) +void machine__remove_thread(struct machine *self, struct thread *th) { self->last_match = NULL; rb_erase(&th->rb_node, &self->threads); @@ -236,16 +229,16 @@ static bool symbol__match_parent_regex(struct symbol *sym) return 0; } -int perf_session__resolve_callchain(struct perf_session *self, - struct thread *thread, - struct ip_callchain *chain, - struct symbol **parent) +int machine__resolve_callchain(struct machine *self, struct perf_evsel *evsel, + struct thread *thread, + struct ip_callchain *chain, + struct symbol **parent) { u8 cpumode = PERF_RECORD_MISC_USER; unsigned int i; int err; - callchain_cursor_reset(&self->callchain_cursor); + callchain_cursor_reset(&evsel->hists.callchain_cursor); for (i = 0; i < chain->nr; i++) { u64 ip; @@ -272,7 +265,7 @@ int perf_session__resolve_callchain(struct perf_session *self, al.filtered = false; thread__find_addr_location(thread, self, cpumode, - MAP__FUNCTION, thread->pid, ip, &al, NULL); + MAP__FUNCTION, ip, &al, NULL); if (al.sym != NULL) { if (sort__has_parent && !*parent && symbol__match_parent_regex(al.sym)) @@ -281,7 +274,7 @@ int perf_session__resolve_callchain(struct perf_session *self, break; } - err = callchain_cursor_append(&self->callchain_cursor, + err = callchain_cursor_append(&evsel->hists.callchain_cursor, ip, al.map, al.sym); if (err) return err; @@ -290,75 +283,91 @@ int perf_session__resolve_callchain(struct perf_session *self, return 0; } -static int process_event_synth_stub(union perf_event *event __used, - struct perf_session *session __used) +static int process_event_synth_tracing_data_stub(union perf_event *event __used, + struct perf_session *session __used) +{ + dump_printf(": unhandled!\n"); + return 0; +} + +static int process_event_synth_attr_stub(union perf_event *event __used, + struct perf_evlist **pevlist __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_event_sample_stub(union perf_event *event __used, +static int process_event_sample_stub(struct perf_tool *tool __used, + union perf_event *event __used, struct perf_sample *sample __used, struct perf_evsel *evsel __used, - struct perf_session *session __used) + struct machine *machine __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_event_stub(union perf_event *event __used, +static int process_event_stub(struct perf_tool *tool __used, + union perf_event *event __used, struct perf_sample *sample __used, - struct perf_session *session __used) + struct machine *machine __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_finished_round_stub(union perf_event *event __used, - struct perf_session *session __used, - struct perf_event_ops *ops __used) +static int process_finished_round_stub(struct perf_tool *tool __used, + union perf_event *event __used, + struct perf_session *perf_session __used) { dump_printf(": unhandled!\n"); return 0; } -static int process_finished_round(union perf_event *event, - struct perf_session *session, - struct perf_event_ops *ops); +static int process_event_type_stub(struct perf_tool *tool __used, + union perf_event *event __used) +{ + dump_printf(": unhandled!\n"); + return 0; +} -static void perf_event_ops__fill_defaults(struct perf_event_ops *handler) +static int process_finished_round(struct perf_tool *tool, + union perf_event *event, + struct perf_session *session); + +static void perf_tool__fill_defaults(struct perf_tool *tool) { - if (handler->sample == NULL) - handler->sample = process_event_sample_stub; - if (handler->mmap == NULL) - handler->mmap = process_event_stub; - if (handler->comm == NULL) - handler->comm = process_event_stub; - if (handler->fork == NULL) - handler->fork = process_event_stub; - if (handler->exit == NULL) - handler->exit = process_event_stub; - if (handler->lost == NULL) - handler->lost = perf_event__process_lost; - if (handler->read == NULL) - handler->read = process_event_stub; - if (handler->throttle == NULL) - handler->throttle = process_event_stub; - if (handler->unthrottle == NULL) - handler->unthrottle = process_event_stub; - if (handler->attr == NULL) - handler->attr = process_event_synth_stub; - if (handler->event_type == NULL) - handler->event_type = process_event_synth_stub; - if (handler->tracing_data == NULL) - handler->tracing_data = process_event_synth_stub; - if (handler->build_id == NULL) - handler->build_id = process_event_synth_stub; - if (handler->finished_round == NULL) { - if (handler->ordered_samples) - handler->finished_round = process_finished_round; + if (tool->sample == NULL) + tool->sample = process_event_sample_stub; + if (tool->mmap == NULL) + tool->mmap = process_event_stub; + if (tool->comm == NULL) + tool->comm = process_event_stub; + if (tool->fork == NULL) + tool->fork = process_event_stub; + if (tool->exit == NULL) + tool->exit = process_event_stub; + if (tool->lost == NULL) + tool->lost = perf_event__process_lost; + if (tool->read == NULL) + tool->read = process_event_sample_stub; + if (tool->throttle == NULL) + tool->throttle = process_event_stub; + if (tool->unthrottle == NULL) + tool->unthrottle = process_event_stub; + if (tool->attr == NULL) + tool->attr = process_event_synth_attr_stub; + if (tool->event_type == NULL) + tool->event_type = process_event_type_stub; + if (tool->tracing_data == NULL) + tool->tracing_data = process_event_synth_tracing_data_stub; + if (tool->build_id == NULL) + tool->build_id = process_finished_round_stub; + if (tool->finished_round == NULL) { + if (tool->ordered_samples) + tool->finished_round = process_finished_round; else - handler->finished_round = process_finished_round_stub; + tool->finished_round = process_finished_round_stub; } } @@ -490,11 +499,11 @@ static void perf_session_free_sample_buffers(struct perf_session *session) static int perf_session_deliver_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_event_ops *ops, + struct perf_tool *tool, u64 file_offset); static void flush_sample_queue(struct perf_session *s, - struct perf_event_ops *ops) + struct perf_tool *tool) { struct ordered_samples *os = &s->ordered_samples; struct list_head *head = &os->samples; @@ -505,7 +514,7 @@ static void flush_sample_queue(struct perf_session *s, unsigned idx = 0, progress_next = os->nr_samples / 16; int ret; - if (!ops->ordered_samples || !limit) + if (!tool->ordered_samples || !limit) return; list_for_each_entry_safe(iter, tmp, head, list) { @@ -516,7 +525,7 @@ static void flush_sample_queue(struct perf_session *s, if (ret) pr_err("Can't parse sample, err = %d\n", ret); else - perf_session_deliver_event(s, iter->event, &sample, ops, + perf_session_deliver_event(s, iter->event, &sample, tool, iter->file_offset); os->last_flush = iter->timestamp; @@ -578,11 +587,11 @@ static void flush_sample_queue(struct perf_session *s, * Flush every events below timestamp 7 * etc... */ -static int process_finished_round(union perf_event *event __used, - struct perf_session *session, - struct perf_event_ops *ops) +static int process_finished_round(struct perf_tool *tool, + union perf_event *event __used, + struct perf_session *session) { - flush_sample_queue(session, ops); + flush_sample_queue(session, tool); session->ordered_samples.next_flush = session->ordered_samples.max_timestamp; return 0; @@ -737,13 +746,26 @@ static void dump_sample(struct perf_session *session, union perf_event *event, callchain__printf(sample); } +static struct machine * + perf_session__find_machine_for_cpumode(struct perf_session *session, + union perf_event *event) +{ + const u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; + + if (cpumode == PERF_RECORD_MISC_GUEST_KERNEL && perf_guest) + return perf_session__find_machine(session, event->ip.pid); + + return perf_session__find_host_machine(session); +} + static int perf_session_deliver_event(struct perf_session *session, union perf_event *event, struct perf_sample *sample, - struct perf_event_ops *ops, + struct perf_tool *tool, u64 file_offset) { struct perf_evsel *evsel; + struct machine *machine; dump_event(session, event, file_offset, sample); @@ -765,6 +787,8 @@ static int perf_session_deliver_event(struct perf_session *session, hists__inc_nr_events(&evsel->hists, event->header.type); } + machine = perf_session__find_machine_for_cpumode(session, event); + switch (event->header.type) { case PERF_RECORD_SAMPLE: dump_sample(session, event, sample); @@ -772,23 +796,25 @@ static int perf_session_deliver_event(struct perf_session *session, ++session->hists.stats.nr_unknown_id; return -1; } - return ops->sample(event, sample, evsel, session); + return tool->sample(tool, event, sample, evsel, machine); case PERF_RECORD_MMAP: - return ops->mmap(event, sample, session); + return tool->mmap(tool, event, sample, machine); case PERF_RECORD_COMM: - return ops->comm(event, sample, session); + return tool->comm(tool, event, sample, machine); case PERF_RECORD_FORK: - return ops->fork(event, sample, session); + return tool->fork(tool, event, sample, machine); case PERF_RECORD_EXIT: - return ops->exit(event, sample, session); + return tool->exit(tool, event, sample, machine); case PERF_RECORD_LOST: - return ops->lost(event, sample, session); + if (tool->lost == perf_event__process_lost) + session->hists.stats.total_lost += event->lost.lost; + return tool->lost(tool, event, sample, machine); case PERF_RECORD_READ: - return ops->read(event, sample, session); + return tool->read(tool, event, sample, evsel, machine); case PERF_RECORD_THROTTLE: - return ops->throttle(event, sample, session); + return tool->throttle(tool, event, sample, machine); case PERF_RECORD_UNTHROTTLE: - return ops->unthrottle(event, sample, session); + return tool->unthrottle(tool, event, sample, machine); default: ++session->hists.stats.nr_unknown_events; return -1; @@ -812,24 +838,29 @@ static int perf_session__preprocess_sample(struct perf_session *session, } static int perf_session__process_user_event(struct perf_session *session, union perf_event *event, - struct perf_event_ops *ops, u64 file_offset) + struct perf_tool *tool, u64 file_offset) { + int err; + dump_event(session, event, file_offset, NULL); /* These events are processed right away */ switch (event->header.type) { case PERF_RECORD_HEADER_ATTR: - return ops->attr(event, session); + err = tool->attr(event, &session->evlist); + if (err == 0) + perf_session__update_sample_type(session); + return err; case PERF_RECORD_HEADER_EVENT_TYPE: - return ops->event_type(event, session); + return tool->event_type(tool, event); case PERF_RECORD_HEADER_TRACING_DATA: /* setup for reading amidst mmap */ lseek(session->fd, file_offset, SEEK_SET); - return ops->tracing_data(event, session); + return tool->tracing_data(event, session); case PERF_RECORD_HEADER_BUILD_ID: - return ops->build_id(event, session); + return tool->build_id(tool, event, session); case PERF_RECORD_FINISHED_ROUND: - return ops->finished_round(event, session, ops); + return tool->finished_round(tool, event, session); default: return -EINVAL; } @@ -837,7 +868,7 @@ static int perf_session__process_user_event(struct perf_session *session, union static int perf_session__process_event(struct perf_session *session, union perf_event *event, - struct perf_event_ops *ops, + struct perf_tool *tool, u64 file_offset) { struct perf_sample sample; @@ -853,7 +884,7 @@ static int perf_session__process_event(struct perf_session *session, hists__inc_nr_events(&session->hists, event->header.type); if (event->header.type >= PERF_RECORD_USER_TYPE_START) - return perf_session__process_user_event(session, event, ops, file_offset); + return perf_session__process_user_event(session, event, tool, file_offset); /* * For all kernel events we get the sample data @@ -866,14 +897,14 @@ static int perf_session__process_event(struct perf_session *session, if (perf_session__preprocess_sample(session, event, &sample)) return 0; - if (ops->ordered_samples) { + if (tool->ordered_samples) { ret = perf_session_queue_event(session, event, &sample, file_offset); if (ret != -ETIME) return ret; } - return perf_session_deliver_event(session, event, &sample, ops, + return perf_session_deliver_event(session, event, &sample, tool, file_offset); } @@ -884,6 +915,11 @@ void perf_event_header__bswap(struct perf_event_header *self) self->size = bswap_16(self->size); } +struct thread *perf_session__findnew(struct perf_session *session, pid_t pid) +{ + return machine__findnew_thread(&session->host_machine, pid); +} + static struct thread *perf_session__register_idle_thread(struct perf_session *self) { struct thread *thread = perf_session__findnew(self, 0); @@ -897,9 +933,9 @@ static struct thread *perf_session__register_idle_thread(struct perf_session *se } static void perf_session__warn_about_errors(const struct perf_session *session, - const struct perf_event_ops *ops) + const struct perf_tool *tool) { - if (ops->lost == perf_event__process_lost && + if (tool->lost == perf_event__process_lost && session->hists.stats.nr_events[PERF_RECORD_LOST] != 0) { ui__warning("Processed %d events and lost %d chunks!\n\n" "Check IO/CPU overload!\n\n", @@ -934,7 +970,7 @@ static void perf_session__warn_about_errors(const struct perf_session *session, volatile int session_done; static int __perf_session__process_pipe_events(struct perf_session *self, - struct perf_event_ops *ops) + struct perf_tool *tool) { union perf_event event; uint32_t size; @@ -943,7 +979,7 @@ static int __perf_session__process_pipe_events(struct perf_session *self, int err; void *p; - perf_event_ops__fill_defaults(ops); + perf_tool__fill_defaults(tool); head = 0; more: @@ -979,8 +1015,7 @@ more: } } - if (size == 0 || - (skip = perf_session__process_event(self, &event, ops, head)) < 0) { + if ((skip = perf_session__process_event(self, &event, tool, head)) < 0) { dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n", head, event.header.size, event.header.type); /* @@ -1003,7 +1038,7 @@ more: done: err = 0; out_err: - perf_session__warn_about_errors(self, ops); + perf_session__warn_about_errors(self, tool); perf_session_free_sample_buffers(self); return err; } @@ -1034,7 +1069,7 @@ fetch_mmaped_event(struct perf_session *session, int __perf_session__process_events(struct perf_session *session, u64 data_offset, u64 data_size, - u64 file_size, struct perf_event_ops *ops) + u64 file_size, struct perf_tool *tool) { u64 head, page_offset, file_offset, file_pos, progress_next; int err, mmap_prot, mmap_flags, map_idx = 0; @@ -1043,7 +1078,7 @@ int __perf_session__process_events(struct perf_session *session, union perf_event *event; uint32_t size; - perf_event_ops__fill_defaults(ops); + perf_tool__fill_defaults(tool); page_size = sysconf(_SC_PAGESIZE); @@ -1098,7 +1133,7 @@ more: size = event->header.size; if (size == 0 || - perf_session__process_event(session, event, ops, file_pos) < 0) { + perf_session__process_event(session, event, tool, file_pos) < 0) { dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n", file_offset + head, event->header.size, event->header.type); @@ -1127,15 +1162,15 @@ more: err = 0; /* do the final flush for ordered samples */ session->ordered_samples.next_flush = ULLONG_MAX; - flush_sample_queue(session, ops); + flush_sample_queue(session, tool); out_err: - perf_session__warn_about_errors(session, ops); + perf_session__warn_about_errors(session, tool); perf_session_free_sample_buffers(session); return err; } int perf_session__process_events(struct perf_session *self, - struct perf_event_ops *ops) + struct perf_tool *tool) { int err; @@ -1146,9 +1181,9 @@ int perf_session__process_events(struct perf_session *self, err = __perf_session__process_events(self, self->header.data_offset, self->header.data_size, - self->size, ops); + self->size, tool); else - err = __perf_session__process_pipe_events(self, ops); + err = __perf_session__process_pipe_events(self, tool); return err; } @@ -1163,9 +1198,8 @@ bool perf_session__has_traces(struct perf_session *self, const char *msg) return true; } -int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps, - const char *symbol_name, - u64 addr) +int maps__set_kallsyms_ref_reloc_sym(struct map **maps, + const char *symbol_name, u64 addr) { char *bracket; enum map_type i; @@ -1224,6 +1258,27 @@ size_t perf_session__fprintf_nr_events(struct perf_session *session, FILE *fp) return ret; } +size_t perf_session__fprintf(struct perf_session *session, FILE *fp) +{ + /* + * FIXME: Here we have to actually print all the machines in this + * session, not just the host... + */ + return machine__fprintf(&session->host_machine, fp); +} + +void perf_session__remove_thread(struct perf_session *session, + struct thread *th) +{ + /* + * FIXME: This one makes no sense, we need to remove the thread from + * the machine it belongs to, perf_session can have many machines, so + * doing it always on ->host_machine is wrong. Fix when auditing all + * the 'perf kvm' code. + */ + machine__remove_thread(&session->host_machine, th); +} + struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type) { @@ -1236,17 +1291,16 @@ struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, return NULL; } -void perf_session__print_ip(union perf_event *event, - struct perf_sample *sample, - struct perf_session *session, - int print_sym, int print_dso) +void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, + struct machine *machine, struct perf_evsel *evsel, + int print_sym, int print_dso) { struct addr_location al; const char *symname, *dsoname; - struct callchain_cursor *cursor = &session->callchain_cursor; + struct callchain_cursor *cursor = &evsel->hists.callchain_cursor; struct callchain_cursor_node *node; - if (perf_event__preprocess_sample(event, session, &al, sample, + if (perf_event__preprocess_sample(event, machine, &al, sample, NULL) < 0) { error("problem processing %d event, skipping it.\n", event->header.type); @@ -1255,7 +1309,7 @@ void perf_session__print_ip(union perf_event *event, if (symbol_conf.use_callchain && sample->callchain) { - if (perf_session__resolve_callchain(session, al.thread, + if (machine__resolve_callchain(machine, evsel, al.thread, sample->callchain, NULL) != 0) { if (verbose) error("Failed to resolve callchain. Skipping\n"); diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h index 6e393c98eb3..37bc38381fb 100644 --- a/tools/perf/util/session.h +++ b/tools/perf/util/session.h @@ -30,9 +30,6 @@ struct perf_session { struct perf_header header; unsigned long size; unsigned long mmap_window; - struct rb_root threads; - struct list_head dead_threads; - struct thread *last_match; struct machine host_machine; struct rb_root machines; struct perf_evlist *evlist; @@ -53,65 +50,31 @@ struct perf_session { int cwdlen; char *cwd; struct ordered_samples ordered_samples; - struct callchain_cursor callchain_cursor; - char filename[0]; + char filename[1]; }; -struct perf_evsel; -struct perf_event_ops; - -typedef int (*event_sample)(union perf_event *event, struct perf_sample *sample, - struct perf_evsel *evsel, struct perf_session *session); -typedef int (*event_op)(union perf_event *self, struct perf_sample *sample, - struct perf_session *session); -typedef int (*event_synth_op)(union perf_event *self, - struct perf_session *session); -typedef int (*event_op2)(union perf_event *self, struct perf_session *session, - struct perf_event_ops *ops); - -struct perf_event_ops { - event_sample sample; - event_op mmap, - comm, - fork, - exit, - lost, - read, - throttle, - unthrottle; - event_synth_op attr, - event_type, - tracing_data, - build_id; - event_op2 finished_round; - bool ordered_samples; - bool ordering_requires_timestamps; -}; +struct perf_tool; struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe, - struct perf_event_ops *ops); + struct perf_tool *tool); void perf_session__delete(struct perf_session *self); void perf_event_header__bswap(struct perf_event_header *self); int __perf_session__process_events(struct perf_session *self, u64 data_offset, u64 data_size, u64 size, - struct perf_event_ops *ops); + struct perf_tool *tool); int perf_session__process_events(struct perf_session *self, - struct perf_event_ops *event_ops); + struct perf_tool *tool); -int perf_session__resolve_callchain(struct perf_session *self, +int perf_session__resolve_callchain(struct perf_session *self, struct perf_evsel *evsel, struct thread *thread, struct ip_callchain *chain, struct symbol **parent); bool perf_session__has_traces(struct perf_session *self, const char *msg); -int perf_session__set_kallsyms_ref_reloc_sym(struct map **maps, - const char *symbol_name, - u64 addr); - void mem_bswap_64(void *src, int byte_size); void perf_event__attr_swap(struct perf_event_attr *attr); @@ -144,12 +107,16 @@ struct machine *perf_session__findnew_machine(struct perf_session *self, pid_t p static inline void perf_session__process_machines(struct perf_session *self, + struct perf_tool *tool, machine__process_t process) { - process(&self->host_machine, self); - return machines__process(&self->machines, process, self); + process(&self->host_machine, tool); + return machines__process(&self->machines, process, tool); } +struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); +size_t perf_session__fprintf(struct perf_session *self, FILE *fp); + size_t perf_session__fprintf_dsos(struct perf_session *self, FILE *fp); size_t perf_session__fprintf_dsos_buildid(struct perf_session *self, @@ -167,13 +134,20 @@ static inline int perf_session__parse_sample(struct perf_session *session, session->header.needs_swap); } +static inline int perf_session__synthesize_sample(struct perf_session *session, + union perf_event *event, + const struct perf_sample *sample) +{ + return perf_event__synthesize_sample(event, session->sample_type, + sample, session->header.needs_swap); +} + struct perf_evsel *perf_session__find_first_evtype(struct perf_session *session, unsigned int type); -void perf_session__print_ip(union perf_event *event, - struct perf_sample *sample, - struct perf_session *session, - int print_sym, int print_dso); +void perf_event__print_ip(union perf_event *event, struct perf_sample *sample, + struct machine *machine, struct perf_evsel *evsel, + int print_sym, int print_dso); int perf_session__cpu_bitmap(struct perf_session *session, const char *cpu_list, unsigned long *cpu_bitmap); diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index 95d37007492..36d4c561957 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -27,7 +27,8 @@ build_tmp = getenv('PYTHON_EXTBUILD_TMP') perf = Extension('perf', sources = ['util/python.c', 'util/ctype.c', 'util/evlist.c', 'util/evsel.c', 'util/cpumap.c', 'util/thread_map.c', - 'util/util.c', 'util/xyarray.c', 'util/cgroup.c'], + 'util/util.c', 'util/xyarray.c', 'util/cgroup.c', + 'util/debugfs.c'], include_dirs = ['util/include'], extra_compile_args = cflags, ) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 632b50c7bc2..215d50f2042 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -1757,7 +1757,7 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, struct stat st; /*sshfs might return bad dent->d_type, so we have to stat*/ - sprintf(path, "%s/%s", dir_name, dent->d_name); + snprintf(path, sizeof(path), "%s/%s", dir_name, dent->d_name); if (stat(path, &st)) continue; @@ -1766,8 +1766,6 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, !strcmp(dent->d_name, "..")) continue; - snprintf(path, sizeof(path), "%s/%s", - dir_name, dent->d_name); ret = map_groups__set_modules_path_dir(mg, path); if (ret < 0) goto out; @@ -1788,9 +1786,6 @@ static int map_groups__set_modules_path_dir(struct map_groups *mg, if (map == NULL) continue; - snprintf(path, sizeof(path), "%s/%s", - dir_name, dent->d_name); - long_name = strdup(path); if (long_name == NULL) { ret = -1; @@ -2609,10 +2604,10 @@ int symbol__init(void) symbol_conf.initialized = true; return 0; -out_free_dso_list: - strlist__delete(symbol_conf.dso_list); out_free_comm_list: strlist__delete(symbol_conf.comm_list); +out_free_dso_list: + strlist__delete(symbol_conf.dso_list); return -1; } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 29f8d742e92..123c2e14353 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -68,6 +68,7 @@ struct strlist; struct symbol_conf { unsigned short priv_size; + unsigned short nr_events; bool try_vmlinux_path, use_modules, sort_by_name, diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c index d5d3b22250f..fb4b7ea6752 100644 --- a/tools/perf/util/thread.c +++ b/tools/perf/util/thread.c @@ -61,7 +61,7 @@ static size_t thread__fprintf(struct thread *self, FILE *fp) map_groups__fprintf(&self->mg, verbose, fp); } -struct thread *perf_session__findnew(struct perf_session *self, pid_t pid) +struct thread *machine__findnew_thread(struct machine *self, pid_t pid) { struct rb_node **p = &self->threads.rb_node; struct rb_node *parent = NULL; @@ -125,12 +125,12 @@ int thread__fork(struct thread *self, struct thread *parent) return 0; } -size_t perf_session__fprintf(struct perf_session *self, FILE *fp) +size_t machine__fprintf(struct machine *machine, FILE *fp) { size_t ret = 0; struct rb_node *nd; - for (nd = rb_first(&self->threads); nd; nd = rb_next(nd)) { + for (nd = rb_first(&machine->threads); nd; nd = rb_next(nd)) { struct thread *pos = rb_entry(nd, struct thread, rb_node); ret += thread__fprintf(pos, fp); diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h index e5f2401c1b5..70c2c13ff67 100644 --- a/tools/perf/util/thread.h +++ b/tools/perf/util/thread.h @@ -18,16 +18,14 @@ struct thread { int comm_len; }; -struct perf_session; +struct machine; void thread__delete(struct thread *self); int thread__set_comm(struct thread *self, const char *comm); int thread__comm_len(struct thread *self); -struct thread *perf_session__findnew(struct perf_session *self, pid_t pid); void thread__insert_map(struct thread *self, struct map *map); int thread__fork(struct thread *self, struct thread *parent); -size_t perf_session__fprintf(struct perf_session *self, FILE *fp); static inline struct map *thread__find_map(struct thread *self, enum map_type type, u64 addr) @@ -35,14 +33,12 @@ static inline struct map *thread__find_map(struct thread *self, return self ? map_groups__find(&self->mg, type, addr) : NULL; } -void thread__find_addr_map(struct thread *self, - struct perf_session *session, u8 cpumode, - enum map_type type, pid_t pid, u64 addr, +void thread__find_addr_map(struct thread *thread, struct machine *machine, + u8 cpumode, enum map_type type, u64 addr, struct addr_location *al); -void thread__find_addr_location(struct thread *self, - struct perf_session *session, u8 cpumode, - enum map_type type, pid_t pid, u64 addr, +void thread__find_addr_location(struct thread *thread, struct machine *machine, + u8 cpumode, enum map_type type, u64 addr, struct addr_location *al, symbol_filter_t filter); #endif /* __PERF_THREAD_H */ diff --git a/tools/perf/util/tool.h b/tools/perf/util/tool.h new file mode 100644 index 00000000000..b0e1aadba8d --- /dev/null +++ b/tools/perf/util/tool.h @@ -0,0 +1,50 @@ +#ifndef __PERF_TOOL_H +#define __PERF_TOOL_H + +#include <stdbool.h> + +struct perf_session; +union perf_event; +struct perf_evlist; +struct perf_evsel; +struct perf_sample; +struct perf_tool; +struct machine; + +typedef int (*event_sample)(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, + struct perf_evsel *evsel, struct machine *machine); + +typedef int (*event_op)(struct perf_tool *tool, union perf_event *event, + struct perf_sample *sample, struct machine *machine); + +typedef int (*event_attr_op)(union perf_event *event, + struct perf_evlist **pevlist); +typedef int (*event_simple_op)(struct perf_tool *tool, union perf_event *event); + +typedef int (*event_synth_op)(union perf_event *event, + struct perf_session *session); + +typedef int (*event_op2)(struct perf_tool *tool, union perf_event *event, + struct perf_session *session); + +struct perf_tool { + event_sample sample, + read; + event_op mmap, + comm, + fork, + exit, + lost, + throttle, + unthrottle; + event_attr_op attr; + event_synth_op tracing_data; + event_simple_op event_type; + event_op2 finished_round, + build_id; + bool ordered_samples; + bool ordering_requires_timestamps; +}; + +#endif /* __PERF_TOOL_H */ diff --git a/tools/perf/util/top.h b/tools/perf/util/top.h index 39965096795..a248f3c2c60 100644 --- a/tools/perf/util/top.h +++ b/tools/perf/util/top.h @@ -1,15 +1,17 @@ #ifndef __PERF_TOP_H #define __PERF_TOP_H 1 +#include "tool.h" #include "types.h" -#include "../perf.h" #include <stddef.h> +#include <stdbool.h> struct perf_evlist; struct perf_evsel; struct perf_session; struct perf_top { + struct perf_tool tool; struct perf_evlist *evlist; /* * Symbols will be added here in perf_event__process_sample and will @@ -23,10 +25,26 @@ struct perf_top { int freq; pid_t target_pid, target_tid; bool hide_kernel_symbols, hide_user_symbols, zero; + bool system_wide; + bool use_tui, use_stdio; + bool sort_has_symbols; + bool dont_use_callchains; + bool kptr_restrict_warned; + bool vmlinux_warned; + bool inherit; + bool group; + bool sample_id_all_avail; + bool dump_symtab; const char *cpu_list; struct hist_entry *sym_filter_entry; struct perf_evsel *sym_evsel; struct perf_session *session; + struct winsize winsize; + unsigned int mmap_pages; + int default_interval; + int realtime_prio; + int sym_pcnt_filter; + const char *sym_filter; }; size_t perf_top__header_snprintf(struct perf_top *top, char *bf, size_t size); diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index d2655f08bcc..ac6830d8292 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -18,7 +18,8 @@ * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -#define _GNU_SOURCE +#include <ctype.h> +#include "util.h" #include <dirent.h> #include <mntent.h> #include <stdio.h> @@ -31,7 +32,6 @@ #include <pthread.h> #include <fcntl.h> #include <unistd.h> -#include <ctype.h> #include <errno.h> #include <stdbool.h> #include <linux/list.h> @@ -44,10 +44,6 @@ #define VERSION "0.5" -#define _STR(x) #x -#define STR(x) _STR(x) -#define MAX_PATH 256 - #define TRACE_CTRL "tracing_on" #define TRACE "trace" #define AVAILABLE "available_tracers" @@ -73,26 +69,6 @@ struct events { }; - -static void die(const char *fmt, ...) -{ - va_list ap; - int ret = errno; - - if (errno) - perror("perf"); - else - ret = -1; - - va_start(ap, fmt); - fprintf(stderr, " "); - vfprintf(stderr, fmt, ap); - va_end(ap); - - fprintf(stderr, "\n"); - exit(ret); -} - void *malloc_or_die(unsigned int size) { void *data; diff --git a/tools/perf/util/trace-event-scripting.c b/tools/perf/util/trace-event-scripting.c index c9dcbec7d80..a3fdf55f317 100644 --- a/tools/perf/util/trace-event-scripting.c +++ b/tools/perf/util/trace-event-scripting.c @@ -39,7 +39,7 @@ static int stop_script_unsupported(void) static void process_event_unsupported(union perf_event *event __unused, struct perf_sample *sample __unused, struct perf_evsel *evsel __unused, - struct perf_session *session __unused, + struct machine *machine __unused, struct thread *thread __unused) { } diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index a8410081764..58ae14c5baa 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -3,7 +3,11 @@ #include <stdbool.h> #include "parse-events.h" -#include "session.h" + +struct machine; +struct perf_sample; +union perf_event; +struct thread; #define __unused __attribute__((unused)) @@ -292,7 +296,7 @@ struct scripting_ops { void (*process_event) (union perf_event *event, struct perf_sample *sample, struct perf_evsel *evsel, - struct perf_session *session, + struct machine *machine, struct thread *thread); int (*generate_script) (const char *outfile); }; diff --git a/tools/perf/util/ui/browsers/annotate.c b/tools/perf/util/ui/browsers/annotate.c index 0575905d120..295a9c93f94 100644 --- a/tools/perf/util/ui/browsers/annotate.c +++ b/tools/perf/util/ui/browsers/annotate.c @@ -224,7 +224,7 @@ static bool annotate_browser__toggle_source(struct annotate_browser *browser) } static int annotate_browser__run(struct annotate_browser *self, int evidx, - int nr_events, void(*timer)(void *arg), + void(*timer)(void *arg), void *arg, int delay_secs) { struct rb_node *nd = NULL; @@ -328,8 +328,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx, notes = symbol__annotation(target); pthread_mutex_lock(¬es->lock); - if (notes->src == NULL && - symbol__alloc_hist(target, nr_events) < 0) { + if (notes->src == NULL && symbol__alloc_hist(target) < 0) { pthread_mutex_unlock(¬es->lock); ui__warning("Not enough memory for annotating '%s' symbol!\n", target->name); @@ -337,7 +336,7 @@ static int annotate_browser__run(struct annotate_browser *self, int evidx, } pthread_mutex_unlock(¬es->lock); - symbol__tui_annotate(target, ms->map, evidx, nr_events, + symbol__tui_annotate(target, ms->map, evidx, timer, arg, delay_secs); } continue; @@ -358,15 +357,15 @@ out: return key; } -int hist_entry__tui_annotate(struct hist_entry *he, int evidx, int nr_events, +int hist_entry__tui_annotate(struct hist_entry *he, int evidx, void(*timer)(void *arg), void *arg, int delay_secs) { - return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, nr_events, + return symbol__tui_annotate(he->ms.sym, he->ms.map, evidx, timer, arg, delay_secs); } int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, - int nr_events, void(*timer)(void *arg), void *arg, + void(*timer)(void *arg), void *arg, int delay_secs) { struct objdump_line *pos, *n; @@ -419,8 +418,7 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, int evidx, browser.b.nr_entries = browser.nr_entries; browser.b.entries = ¬es->src->source, browser.b.width += 18; /* Percentage */ - ret = annotate_browser__run(&browser, evidx, nr_events, - timer, arg, delay_secs); + ret = annotate_browser__run(&browser, evidx, timer, arg, delay_secs); list_for_each_entry_safe(pos, n, ¬es->src->source, node) { list_del(&pos->node); objdump_line__free(pos); diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index d0c94b45968..1212a386a03 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -1020,7 +1020,7 @@ do_annotate: * Don't let this be freed, say, by hists__decay_entry. */ he->used = true; - err = hist_entry__tui_annotate(he, evsel->idx, nr_events, + err = hist_entry__tui_annotate(he, evsel->idx, timer, arg, delay_secs); he->used = false; ui_browser__update_nr_entries(&browser->b, browser->hists->nr_entries); diff --git a/tools/perf/util/ui/progress.c b/tools/perf/util/ui/progress.c index 295e366b631..13aa64e50e1 100644 --- a/tools/perf/util/ui/progress.c +++ b/tools/perf/util/ui/progress.c @@ -14,6 +14,9 @@ void ui_progress__update(u64 curr, u64 total, const char *title) if (use_browser <= 0) return; + if (total == 0) + return; + ui__refresh_dimensions(true); pthread_mutex_lock(&ui__lock); y = SLtt_Screen_Rows / 2 - 2; diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c index e16bf9a707e..d76d1c0ff98 100644 --- a/tools/perf/util/usage.c +++ b/tools/perf/util/usage.c @@ -1,5 +1,8 @@ /* - * GIT - The information manager from hell + * usage.c + * + * Various reporting routines. + * Originally copied from GIT source. * * Copyright (C) Linus Torvalds, 2005 */ diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h index 0128906bac8..37be34dff79 100644 --- a/tools/perf/util/util.h +++ b/tools/perf/util/util.h @@ -245,4 +245,15 @@ int readn(int fd, void *buf, size_t size); #define _STR(x) #x #define STR(x) _STR(x) +/* + * Determine whether some value is a power of two, where zero is + * *not* considered a power of two. + */ + +static inline __attribute__((const)) +bool is_power_of_2(unsigned long n) +{ + return (n != 0 && ((n & (n - 1)) == 0)); +} + #endif diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c index bdd33470b23..697c8b4e59c 100644 --- a/tools/perf/util/values.c +++ b/tools/perf/util/values.c @@ -32,6 +32,7 @@ void perf_read_values_destroy(struct perf_read_values *values) for (i = 0; i < values->threads; i++) free(values->value[i]); + free(values->value); free(values->pid); free(values->tid); free(values->counterrawid); |