171 files changed, 2270 insertions, 984 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index dd10b51b4e6..5405f7aecef 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -32,8 +32,6 @@ DocBook/
	- directory with DocBook templates etc. for kernel documentation.
HOWTO
	- the process and procedures of how to do Linux kernel development.
-IO-mapping.txt
-	- how to access I/O mapped memory from within device drivers.
IPMI.txt
	- info on Linux Intelligent Platform Management Interface (IPMI) Driver.
IRQ-affinity.txt
@@ -84,6 +82,8 @@ blockdev/
	- info on block devices & drivers
btmrvl.txt
	- info on Marvell Bluetooth driver usage.
+bus-virt-phys-mapping.txt
+	- how to access I/O mapped memory from within device drivers.
cachetlb.txt
	- describes the cache/TLB flushing interfaces Linux uses.
cdrom/
@@ -168,6 +168,8 @@ initrd.txt
	- how to use the RAM disk as an initial/temporary root filesystem.
input/
	- info on Linux input device support.
+io-mapping.txt
+	- description of io_mapping functions in linux/io-mapping.h
io_ordering.txt
	- info on ordering I/O writes to memory-mapped addresses.
ioctl/
diff --git a/Documentation/IO-mapping.txt b/Documentation/bus-virt-phys-mapping.txt
index 1b5aa10df84..1b5aa10df84 100644
--- a/Documentation/IO-mapping.txt
+++ b/Documentation/bus-virt-phys-mapping.txt
diff --git a/MAINTAINERS b/MAINTAINERS
index 58848125b8b..db3d0f5061f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5336,6 +5336,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-2.6.git
T:	git git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next-2.6.git
S:	Maintained
F:	arch/sparc/
+F:	drivers/sbus

SPARC SERIAL DRIVERS
M:	"David S. Miller" <davem@davemloft.net>
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index 9877372ffdb..5beb97bafbb 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -82,7 +82,7 @@ unsigned long ftrace_return_to_handler(unsigned long retval0,
	unsigned long ret;

	pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
+	trace.rettime = local_clock();
	ftrace_graph_return(&trace);

	if (unlikely(!ret)) {
@@ -126,7 +126,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
		return;
	}

-	calltime = cpu_clock(raw_smp_processor_id());
+	calltime = local_clock();

	if (push_return_trace(old, calltime,
				self_addr, &trace.depth) == -EBUSY) {
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index b0b21134f61..4b611ca1a76 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -197,6 +197,7 @@ extern const char *powerpc_base_platform;
#define CPU_FTR_SAO			LONG_ASM_CONST(0x0020000000000000)
#define CPU_FTR_CP_USE_DCBTZ		LONG_ASM_CONST(0x0040000000000000)
#define CPU_FTR_UNALIGNED_LD_STD	LONG_ASM_CONST(0x0080000000000000)
+#define CPU_FTR_ASYM_SMT		LONG_ASM_CONST(0x0100000000000000)

#ifndef __ASSEMBLY__

@@ -412,7 +413,7 @@ extern const char *powerpc_base_platform;
	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
	    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
-	    CPU_FTR_DSCR | CPU_FTR_SAO)
+	    CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT)
#define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 773424df828..43855c9f84d 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1263,3 +1263,14 @@ unsigned long randomize_et_dyn(unsigned long base)

	return ret;
}
+
+#ifdef CONFIG_SMP
+int arch_sd_sibling_asym_packing(void)
+{
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		return SD_ASYM_PACKING;
+	}
+	return 0;
+}
+#endif
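The powerpc hunk above supplies an architecture hook for the scheduler. The scheduler side is not part of this diff; the sketch below only illustrates the __weak-default pattern such hooks conventionally rely on, and the generic fallback shown is an assumption, not code from this commit.

    #include <linux/sched.h>	/* SD_ASYM_PACKING */

    /* Assumed generic fallback in the scheduler core: a __weak
     * definition that reports no special topology flags. */
    int __weak arch_sd_sibling_asym_packing(void)
    {
    	return 0;
    }

    /* An architecture overrides it simply by providing a strong symbol,
     * as the powerpc hunk above does: the linker prefers the strong
     * definition, and the SMT sibling sched domain then carries
     * SD_ASYM_PACKING, biasing work toward lower-numbered threads. */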
diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig
index 259e3fd5099..1dc07a0014c 100644
--- a/arch/sparc/configs/sparc64_defconfig
+++ b/arch/sparc/configs/sparc64_defconfig
@@ -1,7 +1,7 @@
#
# Automatically generated make config: don't edit
-# Linux kernel version: 2.6.34-rc3
-# Sat Apr 3 15:49:56 2010
+# Linux kernel version: 2.6.34
+# Wed May 26 21:14:01 2010
#
CONFIG_64BIT=y
CONFIG_SPARC=y
@@ -107,10 +107,9 @@ CONFIG_PERF_COUNTERS=y
# CONFIG_DEBUG_PERF_USE_VMALLOC is not set
CONFIG_VM_EVENT_COUNTERS=y
CONFIG_PCI_QUIRKS=y
-CONFIG_SLUB_DEBUG=y
# CONFIG_COMPAT_BRK is not set
-# CONFIG_SLAB is not set
-CONFIG_SLUB=y
+CONFIG_SLAB=y
+# CONFIG_SLUB is not set
# CONFIG_SLOB is not set
CONFIG_PROFILING=y
CONFIG_TRACEPOINTS=y
@@ -239,6 +238,7 @@ CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
CONFIG_SPARSEMEM_VMEMMAP=y
CONFIG_PAGEFLAGS_EXTENDED=y
CONFIG_SPLIT_PTLOCK_CPUS=4
+# CONFIG_COMPACTION is not set
CONFIG_MIGRATION=y
CONFIG_PHYS_ADDR_T_64BIT=y
CONFIG_ZONE_DMA_FLAG=0
@@ -351,6 +351,7 @@ CONFIG_IPV6_TUNNEL=m
# CONFIG_RDS is not set
# CONFIG_TIPC is not set
# CONFIG_ATM is not set
+# CONFIG_L2TP is not set
# CONFIG_BRIDGE is not set
# CONFIG_NET_DSA is not set
CONFIG_VLAN_8021Q=m
@@ -367,6 +368,7 @@ CONFIG_VLAN_8021Q=m
# CONFIG_IEEE802154 is not set
# CONFIG_NET_SCHED is not set
# CONFIG_DCB is not set
+CONFIG_RPS=y

#
# Network testing
@@ -386,9 +388,14 @@ CONFIG_WIRELESS=y
#
# CFG80211 needs to be enabled for MAC80211
#
+
+#
+# Some wireless drivers require a rate control algorithm
+#
# CONFIG_WIMAX is not set
# CONFIG_RFKILL is not set
# CONFIG_NET_9P is not set
+# CONFIG_CAIF is not set

#
# Device Drivers
@@ -658,6 +665,7 @@ CONFIG_PHYLIB=m
# CONFIG_NATIONAL_PHY is not set
# CONFIG_STE10XP is not set
# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_MICREL_PHY is not set
# CONFIG_MDIO_BITBANG is not set
CONFIG_NET_ETHERNET=y
CONFIG_MII=m
@@ -734,6 +742,8 @@ CONFIG_NETDEV_10000=y
# CONFIG_CHELSIO_T1 is not set
CONFIG_CHELSIO_T3_DEPENDS=y
# CONFIG_CHELSIO_T3 is not set
+CONFIG_CHELSIO_T4_DEPENDS=y
+# CONFIG_CHELSIO_T4 is not set
# CONFIG_ENIC is not set
# CONFIG_IXGBE is not set
# CONFIG_IXGBEVF is not set
@@ -766,6 +776,7 @@ CONFIG_NIU=m
# CONFIG_USB_PEGASUS is not set
# CONFIG_USB_RTL8150 is not set
# CONFIG_USB_USBNET is not set
+# CONFIG_USB_IPHETH is not set
# CONFIG_WAN is not set
# CONFIG_FDDI is not set
# CONFIG_HIPPI is not set
@@ -778,7 +789,6 @@ CONFIG_PPP_DEFLATE=m
CONFIG_PPP_BSDCOMP=m
CONFIG_PPP_MPPE=m
CONFIG_PPPOE=m
-# CONFIG_PPPOL2TP is not set
# CONFIG_SLIP is not set
CONFIG_SLHC=m
# CONFIG_NET_FC is not set
@@ -816,6 +826,7 @@ CONFIG_INPUT_KEYBOARD=y
CONFIG_KEYBOARD_ATKBD=y
# CONFIG_QT2160 is not set
CONFIG_KEYBOARD_LKKBD=m
+# CONFIG_KEYBOARD_TCA6416 is not set
# CONFIG_KEYBOARD_MAX7359 is not set
# CONFIG_KEYBOARD_NEWTON is not set
# CONFIG_KEYBOARD_OPENCORES is not set
@@ -840,6 +851,7 @@ CONFIG_MOUSE_SERIAL=y
# CONFIG_INPUT_TABLET is not set
# CONFIG_INPUT_TOUCHSCREEN is not set
CONFIG_INPUT_MISC=y
+# CONFIG_INPUT_AD714X is not set
CONFIG_INPUT_SPARCSPKR=y
# CONFIG_INPUT_ATI_REMOTE is not set
# CONFIG_INPUT_ATI_REMOTE2 is not set
@@ -848,6 +860,7 @@ CONFIG_INPUT_SPARCSPKR=y
# CONFIG_INPUT_YEALINK is not set
# CONFIG_INPUT_CM109 is not set
# CONFIG_INPUT_UINPUT is not set
+# CONFIG_INPUT_PCF8574 is not set

#
# Hardware I/O ports
@@ -871,6 +884,7 @@ CONFIG_HW_CONSOLE=y
# CONFIG_VT_HW_CONSOLE_BINDING is not set
# CONFIG_DEVKMEM is not set
# CONFIG_SERIAL_NONSTANDARD is not set
+# CONFIG_N_GSM is not set
# CONFIG_NOZOMI is not set

#
@@ -893,6 +907,8 @@ CONFIG_SERIAL_CORE_CONSOLE=y
# CONFIG_SERIAL_JSM is not set
# CONFIG_SERIAL_TIMBERDALE is not set
# CONFIG_SERIAL_GRLIB_GAISLER_APBUART is not set
+# CONFIG_SERIAL_ALTERA_JTAGUART is not set
+# CONFIG_SERIAL_ALTERA_UART is not set
CONFIG_UNIX98_PTYS=y
# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
# CONFIG_LEGACY_PTYS is not set
@@ -1306,11 +1322,14 @@ CONFIG_USB_HIDDEV=y
CONFIG_HID_A4TECH=y
CONFIG_HID_APPLE=y
CONFIG_HID_BELKIN=y
+# CONFIG_HID_CANDO is not set
CONFIG_HID_CHERRY=y
CONFIG_HID_CHICONY=y
+# CONFIG_HID_PRODIKEYS is not set
CONFIG_HID_CYPRESS=y
CONFIG_HID_DRAGONRISE=y
# CONFIG_DRAGONRISE_FF is not set
+# CONFIG_HID_EGALAX is not set
CONFIG_HID_EZKEY=y
CONFIG_HID_KYE=y
CONFIG_HID_GYRATION=y
@@ -1328,7 +1347,9 @@ CONFIG_HID_ORTEK=y
CONFIG_HID_PANTHERLORD=y
# CONFIG_PANTHERLORD_FF is not set
CONFIG_HID_PETALYNX=y
+# CONFIG_HID_PICOLCD is not set
# CONFIG_HID_QUANTA is not set
+# CONFIG_HID_ROCCAT_KONE is not set
CONFIG_HID_SAMSUNG=y
CONFIG_HID_SONY=y
# CONFIG_HID_STANTUM is not set
@@ -1342,6 +1363,7 @@ CONFIG_HID_THRUSTMASTER=y
# CONFIG_THRUSTMASTER_FF is not set
CONFIG_HID_ZEROPLUS=y
# CONFIG_ZEROPLUS_FF is not set
+# CONFIG_HID_ZYDACRON is not set
CONFIG_USB_SUPPORT=y
CONFIG_USB_ARCH_HAS_HCD=y
CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1356,7 +1378,6 @@ CONFIG_USB=y
# CONFIG_USB_DEVICEFS is not set
# CONFIG_USB_DEVICE_CLASS is not set
# CONFIG_USB_DYNAMIC_MINORS is not set
-# CONFIG_USB_OTG is not set
# CONFIG_USB_MON is not set
# CONFIG_USB_WUSB is not set
# CONFIG_USB_WUSB_CBAF is not set
@@ -1521,10 +1542,6 @@ CONFIG_RTC_DRV_STARFIRE=y
# CONFIG_DMADEVICES is not set
# CONFIG_AUXDISPLAY is not set
# CONFIG_UIO is not set
-
-#
-# TI VLYNQ
-#
# CONFIG_STAGING is not set

#
@@ -1706,8 +1723,8 @@ CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
CONFIG_SCHEDSTATS=y
# CONFIG_TIMER_STATS is not set
# CONFIG_DEBUG_OBJECTS is not set
-# CONFIG_SLUB_DEBUG_ON is not set
-# CONFIG_SLUB_STATS is not set
+# CONFIG_DEBUG_SLAB is not set
+# CONFIG_DEBUG_KMEMLEAK is not set
# CONFIG_DEBUG_RT_MUTEXES is not set
# CONFIG_RT_MUTEX_TESTER is not set
# CONFIG_DEBUG_SPINLOCK is not set
@@ -1742,6 +1759,9 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
# CONFIG_DEBUG_PAGEALLOC is not set
CONFIG_NOP_TRACER=y
CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
+CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST=y
+CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
CONFIG_HAVE_DYNAMIC_FTRACE=y
CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
@@ -1769,12 +1789,12 @@ CONFIG_BLK_DEV_IO_TRACE=y
# CONFIG_RING_BUFFER_BENCHMARK is not set
# CONFIG_DYNAMIC_DEBUG is not set
# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_ATOMIC64_SELFTEST is not set
# CONFIG_SAMPLES is not set
CONFIG_HAVE_ARCH_KGDB=y
# CONFIG_KGDB is not set
# CONFIG_DEBUG_STACK_USAGE is not set
# CONFIG_DEBUG_DCFLUSH is not set
-# CONFIG_STACK_DEBUG is not set
# CONFIG_DEBUG_STRICT_USER_COPY_CHECKS is not set

#
@@ -1895,6 +1915,7 @@ CONFIG_CRYPTO_DEFLATE=y
#
# CONFIG_CRYPTO_ANSI_CPRNG is not set
CONFIG_CRYPTO_HW=y
+# CONFIG_CRYPTO_DEV_NIAGARA2 is not set
# CONFIG_CRYPTO_DEV_HIFN_795X is not set
CONFIG_BINARY_PRINTF=y
diff --git a/arch/sparc/include/asm/cache.h b/arch/sparc/include/asm/cache.h
index 0588b8c7faa..69358b590c9 100644
--- a/arch/sparc/include/asm/cache.h
+++ b/arch/sparc/include/asm/cache.h
@@ -11,7 +11,6 @@

#define L1_CACHE_SHIFT 5
#define L1_CACHE_BYTES 32
-#define L1_CACHE_ALIGN(x) ((((x)+(L1_CACHE_BYTES-1))&~(L1_CACHE_BYTES-1)))

#ifdef CONFIG_SPARC32
#define SMP_CACHE_BYTES_SHIFT 5
diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h
index 77f906d8cc2..0ece77f4775 100644
--- a/arch/sparc/include/asm/pgtable_32.h
+++ b/arch/sparc/include/asm/pgtable_32.h
@@ -142,13 +142,12 @@ BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page_vaddr, pgd_t)
#define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd)
#define pgd_page_vaddr(pgd) BTFIXUP_CALL(pgd_page_vaddr)(pgd)

-BTFIXUPDEF_SETHI(none_mask)
BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t)
BTFIXUPDEF_CALL(void, pte_clear, pte_t *)

static inline int pte_none(pte_t pte)
{
-	return !(pte_val(pte) & ~BTFIXUP_SETHI(none_mask));
+	return !pte_val(pte);
}

#define pte_present(pte) BTFIXUP_CALL(pte_present)(pte)
@@ -160,7 +159,7 @@ BTFIXUPDEF_CALL(void, pmd_clear, pmd_t *)

static inline int pmd_none(pmd_t pmd)
{
-	return !(pmd_val(pmd) & ~BTFIXUP_SETHI(none_mask));
+	return !pmd_val(pmd);
}

#define pmd_bad(pmd) BTFIXUP_CALL(pmd_bad)(pmd)
diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c
index 0ec92c8861d..44faabc3c02 100644
--- a/arch/sparc/kernel/perf_event.c
+++ b/arch/sparc/kernel/perf_event.c
@@ -657,6 +657,7 @@ static u64 maybe_change_configuration(struct cpu_hw_events *cpuc, u64 pcr)
		cpuc->current_idx[i] = idx;

		enc = perf_event_get_enc(cpuc->events[i]);
+		pcr &= ~mask_for_index(idx);
		pcr |= event_encoding(enc, idx);
	}
out:
diff --git a/arch/sparc/kernel/sun4d_irq.c b/arch/sparc/kernel/sun4d_irq.c
index ab036a72de5..e11b4612dab 100644
--- a/arch/sparc/kernel/sun4d_irq.c
+++ b/arch/sparc/kernel/sun4d_irq.c
@@ -183,7 +183,7 @@ void sun4d_free_irq(unsigned int irq, void *dev_id)
		goto out_unlock;
	}

-	if (action && tmp)
+	if (tmp)
		tmp->next = action->next;
	else
		*actionp = action->next;
diff --git a/arch/sparc/kernel/ttable.S b/arch/sparc/kernel/ttable.S
index 76d837fc47d..c6dfdaa29e2 100644
--- a/arch/sparc/kernel/ttable.S
+++ b/arch/sparc/kernel/ttable.S
@@ -64,7 +64,7 @@ tl0_irq6:	TRAP_IRQ(smp_call_function_single_client, 6)
tl0_irq6:	BTRAP(0x46)
#endif
tl0_irq7:	TRAP_IRQ(deferred_pcr_work_irq, 7)
-#ifdef CONFIG_KGDB
+#if defined(CONFIG_KGDB) && defined(CONFIG_SMP)
tl0_irq8:	TRAP_IRQ(smp_kgdb_capture_client, 8)
#else
tl0_irq8:	BTRAP(0x48)
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index f5f75a58e0b..b0b43aa5e45 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -2215,8 +2215,6 @@ void __init ld_mmu_srmmu(void)
	BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM);
	BTFIXUPSET_CALL(pgd_page_vaddr, srmmu_pgd_page, BTFIXUPCALL_NORM);

-	BTFIXUPSET_SETHI(none_mask, 0xF0000000);
-
	BTFIXUPSET_CALL(pte_present, srmmu_pte_present, BTFIXUPCALL_NORM);
	BTFIXUPSET_CALL(pte_clear, srmmu_pte_clear, BTFIXUPCALL_SWAPO0G0);

diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c
index cf38846753d..4289f90f869 100644
--- a/arch/sparc/mm/sun4c.c
+++ b/arch/sparc/mm/sun4c.c
@@ -2087,9 +2087,6 @@ void __init ld_mmu_sun4c(void)

	BTFIXUPSET_CALL(set_pte, sun4c_set_pte, BTFIXUPCALL_STO1O0);

-	/* The 2.4.18 code does not set this on sun4c, how does it work? XXX */
-	/* BTFIXUPSET_SETHI(none_mask, 0x00000000); */ /* Defaults to zero? */
-
	BTFIXUPSET_CALL(pte_pfn, sun4c_pte_pfn, BTFIXUPCALL_NORM);
#if 0 /* PAGE_SHIFT <= 12 */ /* Eek. Investigate. XXX */
	BTFIXUPSET_CALL(pmd_page, sun4c_pmd_page, BTFIXUPCALL_ANDNINT(PAGE_SIZE - 1));
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c02cc692985..a96489ee6ca 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -921,7 +921,7 @@ void disable_local_APIC(void)
	unsigned int value;

	/* APIC hasn't been mapped yet */
-	if (!apic_phys)
+	if (!x2apic_mode && !apic_phys)
		return;

	clear_local_APIC();
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf268..e5cc7e82e60 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,6 +18,7 @@
#include <asm/apic.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/hpet.h>

static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -191,6 +192,21 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif

+/*
+ * Force the read back of the CMP register in hpet_next_event()
+ * to work around the problem that the CMP register write seems to be
+ * delayed. See hpet_next_event() for details.
+ *
+ * We do this on all SMBUS incarnations for now until we have more
+ * information about the affected chipsets.
+ */
+static void __init ati_hpet_bugs(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+	hpet_readback_cmp = 1;
+#endif
+}
+
#define QFLAG_APPLY_ONCE	0x1
#define QFLAG_APPLIED		0x2
#define QFLAG_DONE		(QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -220,6 +236,8 @@ static struct chipset early_qrk[] __initdata = {
	  PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
	  PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+	{ PCI_VENDOR_ID_ATI, PCI_ANY_ID,
+	  PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
	{}
};

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 345a4b1fe14..675879b65ce 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -640,8 +640,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
	/* Skip cs, ip, orig_ax and gs. */	\
	"	subl $16, %esp\n"	\
	"	pushl %fs\n"	\
-	"	pushl %ds\n"	\
	"	pushl %es\n"	\
+	"	pushl %ds\n"	\
	"	pushl %eax\n"	\
	"	pushl %ebp\n"	\
	"	pushl %edi\n"	\
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e72d3fc6547..939b9e98245 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -498,15 +498,10 @@ void force_hpet_resume(void)
 * See erratum #27 (Misinterpreted MSI Requests May Result in
 * Corrupted LPC DMA Data) in AMD Publication #46837,
 * "SB700 Family Product Errata", Rev. 1.0, March 2010.
- *
- * Also force the read back of the CMP register in hpet_next_event()
- * to work around the problem that the CMP register write seems to be
- * delayed. See hpet_next_event() for details.
 */
static void force_disable_hpet_msi(struct pci_dev *unused)
{
	hpet_msi_disable = 1;
-	hpet_readback_cmp = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index de3b63ae3da..a60df9ae645 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -238,6 +238,15 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_NUMA
		per_cpu(x86_cpu_to_node_map, cpu) =
			early_per_cpu_map(x86_cpu_to_node_map, cpu);
+		/*
+		 * Ensure that the boot cpu numa_node is correct when the boot
+		 * cpu is on a node that doesn't have memory installed.
+		 * Also cpu_up() will call cpu_to_node() for APs when
+		 * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
+		 * up later with c_init aka intel_init/amd_init.
+		 * So set them all (boot cpu and all APs).
+		 */
+		set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
#endif
		/*
@@ -257,14 +266,6 @@ void __init setup_per_cpu_areas(void)
	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif

-#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
-	/*
-	 * make sure boot cpu numa_node is right, when boot cpu is on the
-	 * node that doesn't have mem installed
-	 */
-	set_cpu_numa_node(boot_cpu_id, early_cpu_to_node(boot_cpu_id));
-#endif
-
	/* Setup node to cpumask map */
	setup_node_to_cpumask_map();

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3699613e883..b1ed0a1a591 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2926,7 +2926,7 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
	return kvm_mmu_zap_page(kvm, page) + 1;
}

-static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
	struct kvm *kvm;
	struct kvm *kvm_freed = NULL;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 6fdb3ec30c3..55253095be8 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -184,6 +184,7 @@ static void __init pcibios_allocate_resources(int pass)
					idx, r, disabled, pass);
				if (pci_claim_resource(dev, idx) < 0) {
					/* We'll assign a new address later */
+					dev->fw_addr[idx] = r->start;
					r->end -= r->start;
					r->start = 0;
				}
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
index 7ef3a2735df..cb29191cee5 100644
--- a/arch/x86/pci/mrst.c
+++ b/arch/x86/pci/mrst.c
@@ -66,8 +66,9 @@ static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
				devfn, pos, 4, &pcie_cap))
			return 0;

-		if (pcie_cap == 0xffffffff)
-			return 0;
+		if (PCI_EXT_CAP_ID(pcie_cap) == 0x0000 ||
+			PCI_EXT_CAP_ID(pcie_cap) == 0xffff)
+			break;

		if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
			raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
@@ -76,7 +77,7 @@ static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
			return pos;
		}

-		pos = pcie_cap >> 20;
+		pos = PCI_EXT_CAP_NEXT(pcie_cap);
	}

	return 0;
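The mrst.c fix above replaces an open-coded `pcie_cap >> 20` with the standard accessors for walking the PCIe extended capability list. A minimal sketch of that traversal pattern; `read_cfg_dword` is a hypothetical stand-in for whatever config-space accessor the caller has:

    #include <linux/types.h>
    #include <linux/pci_regs.h>	/* PCI_EXT_CAP_ID(), PCI_EXT_CAP_NEXT() */

    static int find_ext_cap(u16 want_id, u32 (*read_cfg_dword)(int pos))
    {
    	int pos = 0x100;	/* extended capabilities start at 0x100 */

    	while (pos) {
    		u32 header = read_cfg_dword(pos);

    		/* cap ID 0x0000 means "nothing here"; 0xffff is a failed read */
    		if (PCI_EXT_CAP_ID(header) == 0x0000 ||
    		    PCI_EXT_CAP_ID(header) == 0xffff)
    			break;
    		if (PCI_EXT_CAP_ID(header) == want_id)
    			return pos;
    		pos = PCI_EXT_CAP_NEXT(header);	/* bits 31:20, dword aligned */
    	}
    	return 0;
    }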
diff --git a/drivers/clocksource/cs5535-clockevt.c b/drivers/clocksource/cs5535-clockevt.c
index d7be69f1315..b7dab32ce63 100644
--- a/drivers/clocksource/cs5535-clockevt.c
+++ b/drivers/clocksource/cs5535-clockevt.c
@@ -194,6 +194,6 @@ err_timer:

module_init(cs5535_mfgpt_init);

-MODULE_AUTHOR("Andres Salomon <dilinger@collabora.co.uk>");
+MODULE_AUTHOR("Andres Salomon <dilinger@queued.net>");
MODULE_DESCRIPTION("CS5535/CS5536 MFGPT clock event driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index aedef7941b2..0d2f9dbb47e 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -209,7 +209,7 @@ config EDAC_I5100

config EDAC_MPC85XX
	tristate "Freescale MPC83xx / MPC85xx"
-	depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || MPC85xx)
+	depends on EDAC_MM_EDAC && FSL_SOC && (PPC_83xx || PPC_85xx)
	help
	  Support for error detection and correction on the Freescale
	  MPC8349, MPC8560, MPC8540, MPC8548
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index 52ca09bf472..f39b00a46ed 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -1120,6 +1120,7 @@ static struct of_device_id mpc85xx_mc_err_of_match[] = {
	{ .compatible = "fsl,mpc8555-memory-controller", },
	{ .compatible = "fsl,mpc8560-memory-controller", },
	{ .compatible = "fsl,mpc8568-memory-controller", },
+	{ .compatible = "fsl,mpc8569-memory-controller", },
	{ .compatible = "fsl,mpc8572-memory-controller", },
	{ .compatible = "fsl,mpc8349-memory-controller", },
	{ .compatible = "fsl,p2020-memory-controller", },
diff --git a/drivers/gpio/cs5535-gpio.c b/drivers/gpio/cs5535-gpio.c
index f73a1555e49..e23c06893d1 100644
--- a/drivers/gpio/cs5535-gpio.c
+++ b/drivers/gpio/cs5535-gpio.c
@@ -352,6 +352,6 @@ static void __exit cs5535_gpio_exit(void)
module_init(cs5535_gpio_init);
module_exit(cs5535_gpio_exit);

-MODULE_AUTHOR("Andres Salomon <dilinger@collabora.co.uk>");
+MODULE_AUTHOR("Andres Salomon <dilinger@queued.net>");
MODULE_DESCRIPTION("AMD CS5535/CS5536 GPIO driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 074385882cc..51bd301cf10 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2241,6 +2241,7 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
		page = read_cache_page_gfp(mapping, i,
					   GFP_HIGHUSER |
					   __GFP_COLD |
+					   __GFP_RECLAIMABLE |
					   gfpmask);
		if (IS_ERR(page))
			goto err_pages;
@@ -4741,6 +4742,16 @@ i915_gem_load(struct drm_device *dev)
	list_add(&dev_priv->mm.shrink_list, &shrink_list);
	spin_unlock(&shrink_list_lock);

+	/* On GEN3 we really need to make sure the ARB C3 LP bit is set */
+	if (IS_GEN3(dev)) {
+		u32 tmp = I915_READ(MI_ARB_STATE);
+		if (!(tmp & MI_ARB_C3_LP_WRITE_ENABLE)) {
+			/* arb state is a masked write, so set bit + bit in mask */
+			tmp = MI_ARB_C3_LP_WRITE_ENABLE | (MI_ARB_C3_LP_WRITE_ENABLE << MI_ARB_MASK_SHIFT);
+			I915_WRITE(MI_ARB_STATE, tmp);
+		}
+	}
+
	/* Old X drivers will take 0-2 for front, back, depth buffers */
	if (!drm_core_check_feature(dev, DRIVER_MODESET))
		dev_priv->fence_reg_start = 3;
@@ -4977,7 +4988,7 @@ i915_gpu_is_active(struct drm_device *dev)
}

static int
-i915_gem_shrink(int nr_to_scan, gfp_t gfp_mask)
+i915_gem_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
{
	drm_i915_private_t *dev_priv, *next_dev;
	struct drm_i915_gem_object *obj_priv, *next_obj;
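The GEN3 write in i915_gem_load() above uses the masked-write convention that the new i915_reg.h comments describe: the high 16 bits of the register are per-bit write enables for the low 16 bits. A small sketch of that convention, using the names from this diff:

    /* set a bit in a masked-write register: write the bit plus the same
     * bit shifted into the enable mask; bits whose mask bit is 0 keep
     * their current hardware value */
    #define MI_ARB_MASK_SHIFT 16

    static inline u32 masked_bit_enable(u32 bit)
    {
    	return bit | (bit << MI_ARB_MASK_SHIFT);
    }

    /* usage, in the spirit of the hunk above:
     *	I915_WRITE(MI_ARB_STATE,
     *		   masked_bit_enable(MI_ARB_C3_LP_WRITE_ENABLE));
     */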
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 150400f4053..6d9b0288272 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -359,6 +359,70 @@
#define   LM_BURST_LENGTH	0x00000700
#define   LM_FIFO_WATERMARK	0x0000001F
#define MI_ARB_STATE	0x020e4 /* 915+ only */
+#define   MI_ARB_MASK_SHIFT	16	/* shift for enable bits */
+
+/* Make render/texture TLB fetches lower priorty than associated data
+ * fetches. This is not turned on by default
+ */
+#define   MI_ARB_RENDER_TLB_LOW_PRIORITY	(1 << 15)
+
+/* Isoch request wait on GTT enable (Display A/B/C streams).
+ * Make isoch requests stall on the TLB update. May cause
+ * display underruns (test mode only)
+ */
+#define   MI_ARB_ISOCH_WAIT_GTT			(1 << 14)
+
+/* Block grant count for isoch requests when block count is
+ * set to a finite value.
+ */
+#define   MI_ARB_BLOCK_GRANT_MASK		(3 << 12)
+#define   MI_ARB_BLOCK_GRANT_8			(0 << 12)	/* for 3 display planes */
+#define   MI_ARB_BLOCK_GRANT_4			(1 << 12)	/* for 2 display planes */
+#define   MI_ARB_BLOCK_GRANT_2			(2 << 12)	/* for 1 display plane */
+#define   MI_ARB_BLOCK_GRANT_0			(3 << 12)	/* don't use */
+
+/* Enable render writes to complete in C2/C3/C4 power states.
+ * If this isn't enabled, render writes are prevented in low
+ * power states. That seems bad to me.
+ */
+#define   MI_ARB_C3_LP_WRITE_ENABLE		(1 << 11)
+
+/* This acknowledges an async flip immediately instead
+ * of waiting for 2TLB fetches.
+ */
+#define   MI_ARB_ASYNC_FLIP_ACK_IMMEDIATE	(1 << 10)
+
+/* Enables non-sequential data reads through arbiter
+ */
+#define   MI_ARB_DUAL_DATA_PHASE_DISABLE	(1 << 9)
+
+/* Disable FSB snooping of cacheable write cycles from binner/render
+ * command stream
+ */
+#define   MI_ARB_CACHE_SNOOP_DISABLE		(1 << 8)
+
+/* Arbiter time slice for non-isoch streams */
+#define   MI_ARB_TIME_SLICE_MASK		(7 << 5)
+#define   MI_ARB_TIME_SLICE_1			(0 << 5)
+#define   MI_ARB_TIME_SLICE_2			(1 << 5)
+#define   MI_ARB_TIME_SLICE_4			(2 << 5)
+#define   MI_ARB_TIME_SLICE_6			(3 << 5)
+#define   MI_ARB_TIME_SLICE_8			(4 << 5)
+#define   MI_ARB_TIME_SLICE_10			(5 << 5)
+#define   MI_ARB_TIME_SLICE_14			(6 << 5)
+#define   MI_ARB_TIME_SLICE_16			(7 << 5)
+
+/* Low priority grace period page size */
+#define   MI_ARB_LOW_PRIORITY_GRACE_4KB		(0 << 4)	/* default */
+#define   MI_ARB_LOW_PRIORITY_GRACE_8KB		(1 << 4)
+
+/* Disable display A/B trickle feed */
+#define   MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE	(1 << 2)
+
+/* Set display plane priority */
+#define   MI_ARB_DISPLAY_PRIORITY_A_B		(0 << 0)	/* display A > display B */
+#define   MI_ARB_DISPLAY_PRIORITY_B_A		(1 << 0)	/* display B > display A */
+
#define CACHE_MODE_0	0x02120 /* 915+ only */
#define   CM0_MASK_SHIFT	16
#define   CM0_IZ_OPT_DISABLE	(1<<6)
diff --git a/drivers/gpu/drm/nouveau/nouveau_bios.c b/drivers/gpu/drm/nouveau/nouveau_bios.c
index fc924b64919..e492919faf4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bios.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bios.c
@@ -203,36 +203,26 @@ struct methods {
	const bool rw;
};

-static struct methods nv04_methods[] = {
-	{ "PROM", load_vbios_prom, false },
-	{ "PRAMIN", load_vbios_pramin, true },
-	{ "PCIROM", load_vbios_pci, true },
-};
-
-static struct methods nv50_methods[] = {
-	{ "ACPI", load_vbios_acpi, true },
+static struct methods shadow_methods[] = {
	{ "PRAMIN", load_vbios_pramin, true },
	{ "PROM", load_vbios_prom, false },
	{ "PCIROM", load_vbios_pci, true },
+	{ "ACPI", load_vbios_acpi, true },
};

-#define METHODCNT 3
-
static bool NVShadowVBIOS(struct drm_device *dev, uint8_t *data)
{
-	struct drm_nouveau_private *dev_priv = dev->dev_private;
-	struct methods *methods;
-	int i;
+	const int nr_methods = ARRAY_SIZE(shadow_methods);
+	struct methods *methods = shadow_methods;
	int testscore = 3;
-	int scores[METHODCNT];
+	int scores[nr_methods], i;

	if (nouveau_vbios) {
-		methods = nv04_methods;
-		for (i = 0; i < METHODCNT; i++)
+		for (i = 0; i < nr_methods; i++)
			if (!strcasecmp(nouveau_vbios, methods[i].desc))
				break;

-		if (i < METHODCNT) {
+		if (i < nr_methods) {
			NV_INFO(dev, "Attempting to use BIOS image from %s\n",
				methods[i].desc);

@@ -244,12 +234,7 @@ static bool NVShadowVBIOS(struct drm_device *dev, uint8_t *data)
			NV_ERROR(dev, "VBIOS source \'%s\' invalid\n", nouveau_vbios);
	}

-	if (dev_priv->card_type < NV_50)
-		methods = nv04_methods;
-	else
-		methods = nv50_methods;
-
-	for (i = 0; i < METHODCNT; i++) {
+	for (i = 0; i < nr_methods; i++) {
		NV_TRACE(dev, "Attempting to load BIOS image from %s\n",
			 methods[i].desc);
		data[0] = data[1] = 0;	/* avoid reuse of previous image */
@@ -260,7 +245,7 @@ static bool NVShadowVBIOS(struct drm_device *dev, uint8_t *data)
	}

	while (--testscore > 0) {
-		for (i = 0; i < METHODCNT; i++) {
+		for (i = 0; i < nr_methods; i++) {
			if (scores[i] == testscore) {
				NV_TRACE(dev, "Using BIOS image from %s\n",
					 methods[i].desc);
diff --git a/drivers/gpu/drm/nouveau/nouveau_fbcon.c b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
index c9a4a0d2a11..257ea130ae1 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fbcon.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fbcon.c
@@ -387,7 +387,8 @@ int nouveau_fbcon_init(struct drm_device *dev)
	dev_priv->nfbdev = nfbdev;
	nfbdev->helper.funcs = &nouveau_fbcon_helper_funcs;

-	ret = drm_fb_helper_init(dev, &nfbdev->helper, 2, 4);
+	ret = drm_fb_helper_init(dev, &nfbdev->helper,
+				 nv_two_heads(dev) ? 2 : 1, 4);
	if (ret) {
		kfree(nfbdev);
		return ret;
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 3970e62eaab..aab5ba040bd 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -2354,6 +2354,7 @@ void r100_mc_init(struct radeon_device *rdev)
	if (rdev->flags & RADEON_IS_IGP)
		base = (RREG32(RADEON_NB_TOM) & 0xffff) << 16;
	radeon_vram_location(rdev, &rdev->mc, base);
+	rdev->mc.gtt_base_align = 0;
	if (!(rdev->flags & RADEON_IS_AGP))
		radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c
index 7e81db5eb80..19a7ef7ee34 100644
--- a/drivers/gpu/drm/radeon/r300.c
+++ b/drivers/gpu/drm/radeon/r300.c
@@ -481,6 +481,7 @@ void r300_mc_init(struct radeon_device *rdev)
	if (rdev->flags & RADEON_IS_IGP)
		base = (RREG32(RADEON_NB_TOM) & 0xffff) << 16;
	radeon_vram_location(rdev, &rdev->mc, base);
+	rdev->mc.gtt_base_align = 0;
	if (!(rdev->flags & RADEON_IS_AGP))
		radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
@@ -1176,6 +1177,8 @@ int r300_cs_parse(struct radeon_cs_parser *p)
	int r;

	track = kzalloc(sizeof(*track), GFP_KERNEL);
+	if (track == NULL)
+		return -ENOMEM;
	r100_cs_track_clear(p->rdev, track);
	p->track = track;
	do {
diff --git a/drivers/gpu/drm/radeon/r520.c b/drivers/gpu/drm/radeon/r520.c
index 34330df2848..694af7cc23a 100644
--- a/drivers/gpu/drm/radeon/r520.c
+++ b/drivers/gpu/drm/radeon/r520.c
@@ -125,6 +125,7 @@ void r520_mc_init(struct radeon_device *rdev)
	r520_vram_get_type(rdev);
	r100_vram_init_sizes(rdev);
	radeon_vram_location(rdev, &rdev->mc, 0);
+	rdev->mc.gtt_base_align = 0;
	if (!(rdev->flags & RADEON_IS_AGP))
		radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index 3d6645ce215..e100f69faee 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -1179,6 +1179,7 @@ void r600_vram_gtt_location(struct radeon_device *rdev, struct radeon_mc *mc)
		if (rdev->flags & RADEON_IS_IGP)
			base = (RREG32(MC_VM_FB_LOCATION) & 0xFFFF) << 24;
		radeon_vram_location(rdev, &rdev->mc, base);
+		rdev->mc.gtt_base_align = 0;
		radeon_gtt_location(rdev, mc);
	}
}
diff --git a/drivers/gpu/drm/radeon/r600_blit.c b/drivers/gpu/drm/radeon/r600_blit.c
index f4fb88ece2b..ca5c29f7077 100644
--- a/drivers/gpu/drm/radeon/r600_blit.c
+++ b/drivers/gpu/drm/radeon/r600_blit.c
@@ -538,9 +538,12 @@ int
r600_prepare_blit_copy(struct drm_device *dev, struct drm_file *file_priv)
{
	drm_radeon_private_t *dev_priv = dev->dev_private;
+	int ret;
	DRM_DEBUG("\n");

-	r600_nomm_get_vb(dev);
+	ret = r600_nomm_get_vb(dev);
+	if (ret)
+		return ret;

	dev_priv->blit_vb->file_priv = file_priv;

diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index ab61aaa887b..2f94dc66c18 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -351,6 +351,7 @@ struct radeon_mc {
	int			vram_mtrr;
	bool			vram_is_ddr;
	bool			igp_sideport_enabled;
+	u64			gtt_base_align;
};

bool radeon_combios_sideport_present(struct radeon_device *rdev);
diff --git a/drivers/gpu/drm/radeon/radeon_atombios.c b/drivers/gpu/drm/radeon/radeon_atombios.c
index 99bd8a9c56b..10673ae59cf 100644
--- a/drivers/gpu/drm/radeon/radeon_atombios.c
+++ b/drivers/gpu/drm/radeon/radeon_atombios.c
@@ -280,6 +280,15 @@ static bool radeon_atom_apply_quirks(struct drm_device *dev,
		}
	}

+	/* ASUS HD 3600 board lists the DVI port as HDMI */
+	if ((dev->pdev->device == 0x9598) &&
+	    (dev->pdev->subsystem_vendor == 0x1043) &&
+	    (dev->pdev->subsystem_device == 0x01e4)) {
+		if (*connector_type == DRM_MODE_CONNECTOR_HDMIA) {
+			*connector_type = DRM_MODE_CONNECTOR_DVII;
+		}
+	}
+
	/* ASUS HD 3450 board lists the DVI port as HDMI */
	if ((dev->pdev->device == 0x95C5) &&
	    (dev->pdev->subsystem_vendor == 0x1043) &&
@@ -1029,8 +1038,15 @@ bool radeon_atombios_sideport_present(struct radeon_device *rdev)
								 data_offset);
		switch (crev) {
		case 1:
-			if (igp_info->info.ucMemoryType & 0xf0)
-				return true;
+			/* AMD IGPS */
+			if ((rdev->family == CHIP_RS690) ||
+			    (rdev->family == CHIP_RS740)) {
+				if (igp_info->info.ulBootUpMemoryClock)
+					return true;
+			} else {
+				if (igp_info->info.ucMemoryType & 0xf0)
+					return true;
+			}
			break;
		case 2:
			if (igp_info->info_2.ucMemoryType & 0x0f)
diff --git a/drivers/gpu/drm/radeon/radeon_connectors.c b/drivers/gpu/drm/radeon/radeon_connectors.c
index f58f8bd8f77..adccbc2c202 100644
--- a/drivers/gpu/drm/radeon/radeon_connectors.c
+++ b/drivers/gpu/drm/radeon/radeon_connectors.c
@@ -771,14 +771,14 @@ static enum drm_connector_status radeon_dvi_detect(struct drm_connector *connector)
	} else
		ret = connector_status_connected;

-	/* multiple connectors on the same encoder with the same ddc line
-	 * This tends to be HDMI and DVI on the same encoder with the
-	 * same ddc line.  If the edid says HDMI, consider the HDMI port
-	 * connected and the DVI port disconnected.  If the edid doesn't
-	 * say HDMI, vice versa.
+	/* This gets complicated.  We have boards with VGA + HDMI with a
+	 * shared DDC line and we have boards with DVI-D + HDMI with a shared
+	 * DDC line.  The latter is more complex because with DVI<->HDMI adapters
+	 * you don't really know what's connected to which port as both are digital.
	 */
	if (radeon_connector->shared_ddc && (ret == connector_status_connected)) {
		struct drm_device *dev = connector->dev;
+		struct radeon_device *rdev = dev->dev_private;
		struct drm_connector *list_connector;
		struct radeon_connector *list_radeon_connector;
		list_for_each_entry(list_connector, &dev->mode_config.connector_list, head) {
@@ -788,15 +788,10 @@ static enum drm_connector_status radeon_dvi_detect(struct drm_connector *connector)
			if (list_radeon_connector->shared_ddc && (list_radeon_connector->ddc_bus->rec.i2c_id ==
								  radeon_connector->ddc_bus->rec.i2c_id)) {
-				if (drm_detect_hdmi_monitor(radeon_connector->edid)) {
-					if (connector->connector_type == DRM_MODE_CONNECTOR_DVID) {
-						kfree(radeon_connector->edid);
-						radeon_connector->edid = NULL;
-						ret = connector_status_disconnected;
-					}
-				} else {
-					if ((connector->connector_type == DRM_MODE_CONNECTOR_HDMIA) ||
-					    (connector->connector_type == DRM_MODE_CONNECTOR_HDMIB)) {
+				/* cases where both connectors are digital */
+				if (list_connector->connector_type != DRM_MODE_CONNECTOR_VGA) {
+					/* hpd is our only option in this case */
+					if (!radeon_hpd_sense(rdev, radeon_connector->hpd.hpd)) {
						kfree(radeon_connector->edid);
						radeon_connector->edid = NULL;
						ret = connector_status_disconnected;
diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c
index 5f317317aba..dd279da9054 100644
--- a/drivers/gpu/drm/radeon/radeon_device.c
+++ b/drivers/gpu/drm/radeon/radeon_device.c
@@ -226,20 +226,20 @@ void radeon_gtt_location(struct radeon_device *rdev, struct radeon_mc *mc)
{
	u64 size_af, size_bf;

-	size_af = 0xFFFFFFFF - mc->vram_end;
-	size_bf = mc->vram_start;
+	size_af = ((0xFFFFFFFF - mc->vram_end) + mc->gtt_base_align) & ~mc->gtt_base_align;
+	size_bf = mc->vram_start & ~mc->gtt_base_align;
	if (size_bf > size_af) {
		if (mc->gtt_size > size_bf) {
			dev_warn(rdev->dev, "limiting GTT\n");
			mc->gtt_size = size_bf;
		}
-		mc->gtt_start = mc->vram_start - mc->gtt_size;
+		mc->gtt_start = (mc->vram_start & ~mc->gtt_base_align) - mc->gtt_size;
	} else {
		if (mc->gtt_size > size_af) {
			dev_warn(rdev->dev, "limiting GTT\n");
			mc->gtt_size = size_af;
		}
-		mc->gtt_start = mc->vram_end + 1;
+		mc->gtt_start = (mc->vram_end + 1 + mc->gtt_base_align) & ~mc->gtt_base_align;
	}
	mc->gtt_end = mc->gtt_start + mc->gtt_size - 1;
	dev_info(rdev->dev, "GTT: %lluM 0x%08llX - 0x%08llX\n",
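The radeon_gtt_location() change above leans on a standard mask trick: gtt_base_align is either 0 (no extra constraint) or a power-of-two size minus one, i.e. an alignment mask, as the rs400/rs690 hunks below set it to gtt_size - 1. A minimal illustration of the arithmetic:

    #include <linux/types.h>

    /* round addr up to the next multiple of (align_mask + 1);
     * align_mask must be a power of two minus one, or 0 for no-op */
    static inline u64 gtt_align_up(u64 addr, u64 align_mask)
    {
    	return (addr + align_mask) & ~align_mask;
    }

    /* e.g. with a 512MB GTT, align_mask = 0x1FFFFFFF, so a base of
     * 0x30000000 rounds up to 0x40000000; with align_mask = 0 the
     * address is unchanged, matching the chips that set
     * rdev->mc.gtt_base_align = 0 */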
diff --git a/drivers/gpu/drm/radeon/radeon_legacy_tv.c b/drivers/gpu/drm/radeon/radeon_legacy_tv.c
index f2ed27c8055..03204039774 100644
--- a/drivers/gpu/drm/radeon/radeon_legacy_tv.c
+++ b/drivers/gpu/drm/radeon/radeon_legacy_tv.c
@@ -642,8 +642,8 @@ void radeon_legacy_tv_mode_set(struct drm_encoder *encoder,
	}
	flicker_removal = (tmp + 500) / 1000;

-	if (flicker_removal < 2)
-		flicker_removal = 2;
+	if (flicker_removal < 3)
+		flicker_removal = 3;
	for (i = 0; i < ARRAY_SIZE(SLOPE_limit); ++i) {
		if (flicker_removal == SLOPE_limit[i])
			break;
diff --git a/drivers/gpu/drm/radeon/rs400.c b/drivers/gpu/drm/radeon/rs400.c
index 9e4240b3bf0..f454c9a5e7f 100644
--- a/drivers/gpu/drm/radeon/rs400.c
+++ b/drivers/gpu/drm/radeon/rs400.c
@@ -57,7 +57,9 @@ void rs400_gart_adjust_size(struct radeon_device *rdev)
	}
	if (rdev->family == CHIP_RS400 || rdev->family == CHIP_RS480) {
		/* FIXME: RS400 & RS480 seems to have issue with GART size
-		 * if 4G of system memory (needs more testing) */
+		 * if 4G of system memory (needs more testing)
+		 */
+		/* XXX is this still an issue with proper alignment? */
		rdev->mc.gtt_size = 32 * 1024 * 1024;
		DRM_ERROR("Forcing to 32M GART size (because of ASIC bug ?)\n");
	}
@@ -263,6 +265,7 @@ void rs400_mc_init(struct radeon_device *rdev)
	r100_vram_init_sizes(rdev);
	base = (RREG32(RADEON_NB_TOM) & 0xffff) << 16;
	radeon_vram_location(rdev, &rdev->mc, base);
+	rdev->mc.gtt_base_align = rdev->mc.gtt_size - 1;
	radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
}
diff --git a/drivers/gpu/drm/radeon/rs600.c b/drivers/gpu/drm/radeon/rs600.c
index 7bb4c3e52f3..6dc15ea8ba3 100644
--- a/drivers/gpu/drm/radeon/rs600.c
+++ b/drivers/gpu/drm/radeon/rs600.c
@@ -698,6 +698,7 @@ void rs600_mc_init(struct radeon_device *rdev)
	base = G_000004_MC_FB_START(base) << 16;
	rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev);
	radeon_vram_location(rdev, &rdev->mc, base);
+	rdev->mc.gtt_base_align = 0;
	radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
}
diff --git a/drivers/gpu/drm/radeon/rs690.c b/drivers/gpu/drm/radeon/rs690.c
index f4f0a61bcdc..ce4ecbe1081 100644
--- a/drivers/gpu/drm/radeon/rs690.c
+++ b/drivers/gpu/drm/radeon/rs690.c
@@ -162,6 +162,7 @@ void rs690_mc_init(struct radeon_device *rdev)
	rs690_pm_info(rdev);
	rdev->mc.igp_sideport_enabled = radeon_atombios_sideport_present(rdev);
	radeon_vram_location(rdev, &rdev->mc, base);
+	rdev->mc.gtt_base_align = rdev->mc.gtt_size - 1;
	radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
}
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index 7d9a7b0a180..0c9c169a685 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -195,6 +195,7 @@ void rv515_mc_init(struct radeon_device *rdev)
	rv515_vram_get_type(rdev);
	r100_vram_init_sizes(rdev);
	radeon_vram_location(rdev, &rdev->mc, 0);
+	rdev->mc.gtt_base_align = 0;
	if (!(rdev->flags & RADEON_IS_AGP))
		radeon_gtt_location(rdev, &rdev->mc);
	radeon_update_bandwidth_info(rdev);
diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c
index b1d67dc973d..d233c65f3f7 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -40,7 +40,9 @@
#include <linux/slab.h>

#include <asm/atomic.h>
+#ifdef TTM_HAS_AGP
#include <asm/agp.h>
+#endif

#include "ttm/ttm_bo_driver.h"
#include "ttm/ttm_page_alloc.h"
@@ -392,7 +394,7 @@ static int ttm_pool_get_num_unused_pages(void)
/**
 * Callback for mm to request pool to reduce number of page held.
 */
-static int ttm_pool_mm_shrink(int shrink_pages, gfp_t gfp_mask)
+static int ttm_pool_mm_shrink(struct shrinker *shrink, int shrink_pages, gfp_t gfp_mask)
{
	static atomic_t start_pool = ATOMIC_INIT(0);
	unsigned i;
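Three hunks in this merge (kvm's mmu_shrink, i915_gem_shrink, and ttm_pool_mm_shrink above) adapt to the same core-mm change: the shrinker callback now receives the struct shrinker that fired, so one callback can serve several registered shrinkers. A minimal sketch of the API as it stands after this change:

    #include <linux/mm.h>	/* struct shrinker, register_shrinker() */

    static int demo_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
    {
    	if (nr_to_scan == 0)
    		return 0;	/* query pass: report cache size (none here) */
    	/* ...free up to nr_to_scan objects, honouring gfp_mask... */
    	return 0;		/* objects remaining in the cache */
    }

    static struct shrinker demo_shrinker = {
    	.shrink	= demo_shrink,
    	.seeks	= DEFAULT_SEEKS,
    };

    /* register_shrinker(&demo_shrinker) at init,
     * unregister_shrinker(&demo_shrinker) at teardown */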
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
index f1d62611241..437ac786277 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_kms.c
@@ -972,6 +972,7 @@ int vmw_kms_update_layout_ioctl(struct drm_device *dev, void *data,
	ret = copy_from_user(rects, user_rects, rects_size);
	if (unlikely(ret != 0)) {
		DRM_ERROR("Failed to get rects.\n");
+		ret = -EFAULT;
		goto out_free;
	}

diff --git a/drivers/misc/cs5535-mfgpt.c b/drivers/misc/cs5535-mfgpt.c
index 9bec24db4d4..2d44b330010 100644
--- a/drivers/misc/cs5535-mfgpt.c
+++ b/drivers/misc/cs5535-mfgpt.c
@@ -366,6 +366,6 @@ static int __init cs5535_mfgpt_init(void)

module_init(cs5535_mfgpt_init);

-MODULE_AUTHOR("Andres Salomon <dilinger@collabora.co.uk>");
+MODULE_AUTHOR("Andres Salomon <dilinger@queued.net>");
MODULE_DESCRIPTION("CS5535/CS5536 MFGPT timer driver");
MODULE_LICENSE("GPL");
diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c
index af217924a76..ad30f074ee1 100644
--- a/drivers/mmc/host/sdhci-s3c.c
+++ b/drivers/mmc/host/sdhci-s3c.c
@@ -365,6 +365,26 @@ static int __devinit sdhci_s3c_probe(struct platform_device *pdev)

static int __devexit sdhci_s3c_remove(struct platform_device *pdev)
{
+	struct sdhci_host *host = platform_get_drvdata(pdev);
+	struct sdhci_s3c *sc = sdhci_priv(host);
+	int ptr;
+
+	sdhci_remove_host(host, 1);
+
+	for (ptr = 0; ptr < 3; ptr++) {
+		clk_disable(sc->clk_bus[ptr]);
+		clk_put(sc->clk_bus[ptr]);
+	}
+	clk_disable(sc->clk_io);
+	clk_put(sc->clk_io);
+
+	iounmap(host->ioaddr);
+	release_resource(sc->ioarea);
+	kfree(sc->ioarea);
+
+	sdhci_free_host(host);
+	platform_set_drvdata(pdev, NULL);
+
	return 0;
}
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 7acb3edc47e..2602852cc55 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -677,7 +677,7 @@ static int ibmveth_close(struct net_device *netdev)
	if (!adapter->pool_config)
		netif_stop_queue(netdev);

-	free_irq(netdev->irq, netdev);
+	h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_DISABLE);

	do {
		lpar_rc = h_free_logical_lan(adapter->vdev->unit_address);
@@ -689,6 +689,8 @@ static int ibmveth_close(struct net_device *netdev)
			lpar_rc);
	}

+	free_irq(netdev->irq, netdev);
+
	adapter->rx_no_buffer = *(u64*)(((char*)adapter->buffer_list_addr) + 4096 - 8);

	ibmveth_cleanup(adapter);
diff --git a/drivers/net/pcmcia/axnet_cs.c b/drivers/net/pcmcia/axnet_cs.c
index 5b3dfb4ab27..33525bf2a3d 100644
--- a/drivers/net/pcmcia/axnet_cs.c
+++ b/drivers/net/pcmcia/axnet_cs.c
@@ -1168,6 +1168,7 @@ static irqreturn_t ax_interrupt(int irq, void *dev_id)
	int interrupts, nr_serviced = 0, i;
	struct ei_device *ei_local;
	int handled = 0;
+	unsigned long flags;

	e8390_base = dev->base_addr;
	ei_local = netdev_priv(dev);

@@ -1176,7 +1177,7 @@ static irqreturn_t ax_interrupt(int irq, void *dev_id)
	 *	Protect the irq test too.
	 */

-	spin_lock(&ei_local->page_lock);
+	spin_lock_irqsave(&ei_local->page_lock, flags);

	if (ei_local->irqlock) {
@@ -1188,7 +1189,7 @@ static irqreturn_t ax_interrupt(int irq, void *dev_id)
			dev->name, inb_p(e8390_base + EN0_ISR),
			inb_p(e8390_base + EN0_IMR));
#endif
-		spin_unlock(&ei_local->page_lock);
+		spin_unlock_irqrestore(&ei_local->page_lock, flags);
		return IRQ_NONE;
	}

@@ -1261,7 +1262,7 @@ static irqreturn_t ax_interrupt(int irq, void *dev_id)
	ei_local->irqlock = 0;
	outb_p(ENISR_ALL, e8390_base + EN0_IMR);

-	spin_unlock(&ei_local->page_lock);
+	spin_unlock_irqrestore(&ei_local->page_lock, flags);
	return IRQ_RETVAL(handled);
}
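The axnet_cs hunk above switches from plain spin_lock to spin_lock_irqsave. The usual reason for that pattern, and the assumption made in this sketch, is that the handler can also run outside hard-IRQ context (for example from a watchdog or poll path), so each path must disable local interrupts itself rather than rely on already being in an interrupt:

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(page_lock);

    /* callable from process, softirq, or hard-IRQ context alike */
    static void touch_shared_state(void)
    {
    	unsigned long flags;

    	spin_lock_irqsave(&page_lock, flags);	/* disables local IRQs */
    	/* ...touch registers/state shared with the interrupt handler;
    	 * without irqsave, the IRQ firing here would deadlock on the
    	 * same lock... */
    	spin_unlock_irqrestore(&page_lock, flags);
    }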
diff --git a/drivers/net/r8169.c b/drivers/net/r8169.c
index 96b6cfbf0a3..cdc6a5c2e70 100644
--- a/drivers/net/r8169.c
+++ b/drivers/net/r8169.c
@@ -1316,7 +1316,7 @@ static void rtl8169_get_mac_version(struct rtl8169_private *tp,
		{ 0x7c800000, 0x28000000,	RTL_GIGA_MAC_VER_26 },

		/* 8168C family. */
-		{ 0x7cf00000, 0x3ca00000,	RTL_GIGA_MAC_VER_24 },
+		{ 0x7cf00000, 0x3cb00000,	RTL_GIGA_MAC_VER_24 },
		{ 0x7cf00000, 0x3c900000,	RTL_GIGA_MAC_VER_23 },
		{ 0x7cf00000, 0x3c800000,	RTL_GIGA_MAC_VER_18 },
		{ 0x7c800000, 0x3c800000,	RTL_GIGA_MAC_VER_24 },
diff --git a/drivers/net/wireless/ath/ath9k/hif_usb.c b/drivers/net/wireless/ath/ath9k/hif_usb.c
index 77b359162d6..23c15aa9fbd 100644
--- a/drivers/net/wireless/ath/ath9k/hif_usb.c
+++ b/drivers/net/wireless/ath/ath9k/hif_usb.c
@@ -730,13 +730,17 @@ static int ath9k_hif_usb_alloc_urbs(struct hif_device_usb *hif_dev)

	/* RX */
	if (ath9k_hif_usb_alloc_rx_urbs(hif_dev) < 0)
-		goto err;
+		goto err_rx;

	/* Register Read */
	if (ath9k_hif_usb_alloc_reg_in_urb(hif_dev) < 0)
-		goto err;
+		goto err_reg;

	return 0;
+err_reg:
+	ath9k_hif_usb_dealloc_rx_urbs(hif_dev);
+err_rx:
+	ath9k_hif_usb_dealloc_tx_urbs(hif_dev);
err:
	return -ENOMEM;
}
diff --git a/drivers/net/wireless/hostap/hostap_pci.c b/drivers/net/wireless/hostap/hostap_pci.c
index d24dc7dc072..972a9c3af39 100644
--- a/drivers/net/wireless/hostap/hostap_pci.c
+++ b/drivers/net/wireless/hostap/hostap_pci.c
@@ -330,6 +330,7 @@ static int prism2_pci_probe(struct pci_dev *pdev,
	dev->irq = pdev->irq;
	hw_priv->mem_start = mem;
+	dev->base_addr = (unsigned long) mem;

	prism2_pci_cor_sreset(local);

diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.h b/drivers/net/wireless/iwlwifi/iwl-sta.h
index c2a453a1a99..dc43ebd1f1f 100644
--- a/drivers/net/wireless/iwlwifi/iwl-sta.h
+++ b/drivers/net/wireless/iwlwifi/iwl-sta.h
@@ -97,6 +97,17 @@ static inline void iwl_clear_driver_stations(struct iwl_priv *priv)
	spin_lock_irqsave(&priv->sta_lock, flags);
	memset(priv->stations, 0, sizeof(priv->stations));
	priv->num_stations = 0;
+
+	/*
+	 * Remove all key information that is not stored as part of station
+	 * information since mac80211 may not have had a
+	 * chance to remove all the keys. When device is reconfigured by
+	 * mac80211 after an error all keys will be reconfigured.
+	 */
+	priv->ucode_key_table = 0;
+	priv->key_mapping_key = 0;
+	memset(priv->wep_keys, 0, sizeof(priv->wep_keys));
+
	spin_unlock_irqrestore(&priv->sta_lock, flags);
}

diff --git a/drivers/net/wireless/rt2x00/rt2x00dev.c b/drivers/net/wireless/rt2x00/rt2x00dev.c
index 3ae468c4d76..f20d3eeeea7 100644
--- a/drivers/net/wireless/rt2x00/rt2x00dev.c
+++ b/drivers/net/wireless/rt2x00/rt2x00dev.c
@@ -854,6 +854,11 @@ int rt2x00lib_probe_dev(struct rt2x00_dev *rt2x00dev)
	    BIT(NL80211_IFTYPE_WDS);

	/*
+	 * Initialize configuration work.
+	 */
+	INIT_WORK(&rt2x00dev->intf_work, rt2x00lib_intf_scheduled);
+
+	/*
	 * Let the driver probe the device to detect the capabilities.
	 */
	retval = rt2x00dev->ops->lib->probe_hw(rt2x00dev);
	if (retval) {
	}

	/*
-	 * Initialize configuration work.
-	 */
-	INIT_WORK(&rt2x00dev->intf_work, rt2x00lib_intf_scheduled);
-
-	/*
	 * Allocate queue array.
	 */
	retval = rt2x00queue_allocate(rt2x00dev);
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index 92379e2d37e..2aaa13150de 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -156,6 +156,38 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev,
				pcibios_align_resource, dev);
	}

+	if (ret < 0 && dev->fw_addr[resno]) {
+		struct resource *root, *conflict;
+		resource_size_t start, end;
+
+		/*
+		 * If we failed to assign anything, let's try the address
+		 * where firmware left it.  That at least has a chance of
+		 * working, which is better than just leaving it disabled.
+		 */
+
+		if (res->flags & IORESOURCE_IO)
+			root = &ioport_resource;
+		else
+			root = &iomem_resource;
+
+		start = res->start;
+		end = res->end;
+		res->start = dev->fw_addr[resno];
+		res->end = res->start + size - 1;
+		dev_info(&dev->dev, "BAR %d: trying firmware assignment %pR\n",
+			 resno, res);
+		conflict = request_resource_conflict(root, res);
+		if (conflict) {
+			dev_info(&dev->dev,
+				 "BAR %d: %pR conflicts with %s %pR\n", resno,
+				 res, conflict->name, conflict);
+			res->start = start;
+			res->end = end;
+		} else
+			ret = 0;
+	}
+
	if (!ret) {
		res->flags &= ~IORESOURCE_STARTALIGN;
		dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res);
diff --git a/drivers/pcmcia/pcmcia_resource.c b/drivers/pcmcia/pcmcia_resource.c
index 29f91fac1df..a4cd9adfcbc 100644
--- a/drivers/pcmcia/pcmcia_resource.c
+++ b/drivers/pcmcia/pcmcia_resource.c
@@ -857,8 +857,10 @@ void pcmcia_disable_device(struct pcmcia_device *p_dev)
{
	pcmcia_release_configuration(p_dev);
	pcmcia_release_io(p_dev, &p_dev->io);
-	if (p_dev->_irq)
+	if (p_dev->_irq) {
		free_irq(p_dev->irq, p_dev->priv);
+		p_dev->_irq = 0;
+	}

	if (p_dev->win)
		pcmcia_release_window(p_dev, p_dev->win);
}
diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c
index 40658e3385b..bb2f1fba637 100644
--- a/drivers/platform/x86/intel_scu_ipc.c
+++ b/drivers/platform/x86/intel_scu_ipc.c
@@ -489,7 +489,7 @@ int intel_scu_ipc_simple_command(int cmd, int sub)
		mutex_unlock(&ipclock);
		return -ENODEV;
	}
-	ipc_command(cmd << 12 | sub);
+	ipc_command(sub << 12 | cmd);
	err = busy_loop();
	mutex_unlock(&ipclock);
	return err;
@@ -501,9 +501,9 @@ EXPORT_SYMBOL(intel_scu_ipc_simple_command);
 *	@cmd: command
 *	@sub: sub type
 *	@in: input data
-*	@inlen: input length
+*	@inlen: input length in dwords
 *	@out: output data
-*	@outlein: output length
+*	@outlein: output length in dwords
 *
 *	Issue a command to the SCU which involves data transfers. Do the
 *	data copies under the lock but leave it for the caller to interpret
@@ -524,7 +524,7 @@ int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen,
	for (i = 0; i < inlen; i++)
		ipc_data_writel(*in++, 4 * i);

-	ipc_command((cmd << 12) | sub | (inlen << 18));
+	ipc_command((sub << 12) | cmd | (inlen << 18));
	err = busy_loop();

	for (i = 0; i < outlen; i++)
@@ -556,6 +556,10 @@ int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data)
	u32 cmd = 0;

	mutex_lock(&ipclock);
+	if (ipcdev.pdev == NULL) {
+		mutex_unlock(&ipclock);
+		return -ENODEV;
+	}
	cmd = (addr >> 24) & 0xFF;
	if (cmd == IPC_I2C_READ) {
		writel(addr, ipcdev.i2c_base + IPC_I2C_CNTRL_ADDR);
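The intel_scu_ipc fix above swaps cmd and sub in the command word. The field layout implied by the corrected code (read off the code itself, not from a datasheet) is: command in the low bits, sub-command at bit 12, dword count at bit 18. A tiny helper makes the packing explicit:

    #include <linux/types.h>

    /* hypothetical helper; field positions inferred from the fixed
     * ipc_command() calls above */
    static inline u32 scu_ipc_word(int cmd, int sub, int inlen)
    {
    	return (sub << 12) | cmd | (inlen << 18);
    }

    /* intel_scu_ipc_simple_command() then amounts to issuing
     * scu_ipc_word(cmd, sub, 0); the original code had cmd and sub
     * transposed, producing a well-formed but wrong command word */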
Do the * data copies under the lock but leave it for the caller to interpret @@ -524,7 +524,7 @@ int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, for (i = 0; i < inlen; i++) ipc_data_writel(*in++, 4 * i); - ipc_command((cmd << 12) | sub | (inlen << 18)); + ipc_command((sub << 12) | cmd | (inlen << 18)); err = busy_loop(); for (i = 0; i < outlen; i++) @@ -556,6 +556,10 @@ int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data) u32 cmd = 0; mutex_lock(&ipclock); + if (ipcdev.pdev == NULL) { + mutex_unlock(&ipclock); + return -ENODEV; + } cmd = (addr >> 24) & 0xFF; if (cmd == IPC_I2C_READ) { writel(addr, ipcdev.i2c_base + IPC_I2C_CNTRL_ADDR); diff --git a/drivers/power/ds2782_battery.c b/drivers/power/ds2782_battery.c index d762a0cbc6a..2afbeec8b79 100644 --- a/drivers/power/ds2782_battery.c +++ b/drivers/power/ds2782_battery.c @@ -163,7 +163,7 @@ static int ds2782_get_capacity(struct ds278x_info *info, int *capacity) if (err) return err; *capacity = raw; - return raw; + return 0; } static int ds2786_get_current(struct ds278x_info *info, int *current_uA) diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index 34d51dd4c53..bed7b4634cc 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -948,8 +948,10 @@ static ssize_t dasd_alias_show(struct device *dev, if (device->discipline && device->discipline->get_uid && !device->discipline->get_uid(device, &uid)) { if (uid.type == UA_BASE_PAV_ALIAS || - uid.type == UA_HYPER_PAV_ALIAS) + uid.type == UA_HYPER_PAV_ALIAS) { + dasd_put_device(device); return sprintf(buf, "1\n"); + } } dasd_put_device(device); diff --git a/drivers/s390/cio/chsc.c b/drivers/s390/cio/chsc.c index ce7cb87479f..407d0e9adfa 100644 --- a/drivers/s390/cio/chsc.c +++ b/drivers/s390/cio/chsc.c @@ -713,7 +713,7 @@ int chsc_determine_base_channel_path_desc(struct chp_id chpid, ret = chsc_determine_channel_path_desc(chpid, 0, 0, 0, 0, chsc_resp); if (ret) goto out_free; - memcpy(desc, &chsc_resp->data, chsc_resp->length); + memcpy(desc, &chsc_resp->data, sizeof(*desc)); out_free: kfree(chsc_resp); return ret; diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index d53e62ab09d..aacbe14e2e7 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -554,7 +554,7 @@ static int opiocgetnext(unsigned int cmd, void __user *argp) static int openprom_bsd_ioctl(struct file * file, unsigned int cmd, unsigned long arg) { - DATA *data = (DATA *) file->private_data; + DATA *data = file->private_data; void __user *argp = (void __user *)arg; int err; @@ -601,7 +601,7 @@ static int openprom_bsd_ioctl(struct file * file, static long openprom_ioctl(struct file * file, unsigned int cmd, unsigned long arg) { - DATA *data = (DATA *) file->private_data; + DATA *data = file->private_data; switch (cmd) { case OPROMGETOPT: diff --git a/drivers/serial/suncore.c b/drivers/serial/suncore.c index ed7d958b0a0..544f2e25d0e 100644 --- a/drivers/serial/suncore.c +++ b/drivers/serial/suncore.c @@ -71,7 +71,9 @@ int sunserial_console_match(struct console *con, struct device_node *dp, con->index = line; drv->cons = con; - add_preferred_console(con->name, line, NULL); + + if (!console_set_on_cmdline) + add_preferred_console(con->name, line, NULL); return 1; } diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c index 234459c2f01..ffbf4553f66 100644 --- a/drivers/serial/sunsu.c +++ b/drivers/serial/sunsu.c @@ -1500,20 +1500,25 @@ out_unmap: static int __devexit su_remove(struct of_device *op) { struct 
diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c
index d53e62ab09d..aacbe14e2e7 100644
--- a/drivers/sbus/char/openprom.c
+++ b/drivers/sbus/char/openprom.c
@@ -554,7 +554,7 @@ static int opiocgetnext(unsigned int cmd, void __user *argp)
static int openprom_bsd_ioctl(struct file * file,
			      unsigned int cmd, unsigned long arg)
{
-	DATA *data = (DATA *) file->private_data;
+	DATA *data = file->private_data;
	void __user *argp = (void __user *)arg;
	int err;

@@ -601,7 +601,7 @@ static int openprom_bsd_ioctl(struct file * file,
static long openprom_ioctl(struct file * file,
			   unsigned int cmd, unsigned long arg)
{
-	DATA *data = (DATA *) file->private_data;
+	DATA *data = file->private_data;

	switch (cmd) {
	case OPROMGETOPT:
diff --git a/drivers/serial/suncore.c b/drivers/serial/suncore.c
index ed7d958b0a0..544f2e25d0e 100644
--- a/drivers/serial/suncore.c
+++ b/drivers/serial/suncore.c
@@ -71,7 +71,9 @@ int sunserial_console_match(struct console *con, struct device_node *dp,

	con->index = line;
	drv->cons = con;
-	add_preferred_console(con->name, line, NULL);
+
+	if (!console_set_on_cmdline)
+		add_preferred_console(con->name, line, NULL);

	return 1;
}
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 234459c2f01..ffbf4553f66 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -1500,20 +1500,25 @@ out_unmap:
static int __devexit su_remove(struct of_device *op)
{
	struct uart_sunsu_port *up = dev_get_drvdata(&op->dev);
+	bool kbdms = false;

	if (up->su_type == SU_PORT_MS ||
-	    up->su_type == SU_PORT_KBD) {
+	    up->su_type == SU_PORT_KBD)
+		kbdms = true;
+
+	if (kbdms) {
#ifdef CONFIG_SERIO
		serio_unregister_port(&up->serio);
#endif
-		kfree(up);
-	} else if (up->port.type != PORT_UNKNOWN) {
+	} else if (up->port.type != PORT_UNKNOWN)
		uart_remove_one_port(&sunsu_reg, &up->port);
-	}

	if (up->port.membase)
		of_iounmap(&op->resource[0], up->port.membase, up->reg_size);

+	if (kbdms)
+		kfree(up);
+
	dev_set_drvdata(&op->dev, NULL);

	return 0;
diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c
index d69eccf5f19..2aaa0f75c6c 100644
--- a/drivers/usb/gadget/f_fs.c
+++ b/drivers/usb/gadget/f_fs.c
@@ -136,7 +136,7 @@ struct ffs_data {
	 * handling setup requests immidiatelly user space may be so
	 * slow that another setup will be sent to the gadget but this
	 * time not to us but another function and then there could be
-	 * a race.  Is taht the case?  Or maybe we can use cdev->req
+	 * a race.  Is that the case?  Or maybe we can use cdev->req
	 * after all, maybe we just need some spinlock for that? */
	struct usb_request *ep0req;		/* P: mutex */
	struct completion ep0req_completion;	/* P: mutex */
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 57a593c58cf..d219070fed3 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -177,8 +177,8 @@ static void handle_tx(struct vhost_net *net)
			break;
		}
		if (err != len)
-			pr_err("Truncated TX packet: "
-			       " len %d != %zd\n", err, len);
+			pr_debug("Truncated TX packet: "
+				 " len %d != %zd\n", err, len);
		vhost_add_used_and_signal(&net->dev, vq, head, 0);
		total_len += len;
		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
@@ -275,8 +275,8 @@ static void handle_rx(struct vhost_net *net)
		}
		/* TODO: Should check and handle checksum. */
		if (err > len) {
-			pr_err("Discarded truncated rx packet: "
-			       " len %d > %zd\n", err, len);
+			pr_debug("Discarded truncated rx packet: "
+				 " len %d > %zd\n", err, len);
			vhost_discard_vq_desc(vq);
			continue;
		}
@@ -534,11 +534,16 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
	rcu_assign_pointer(vq->private_data, sock);
	vhost_net_enable_vq(n, vq);
done:
+	mutex_unlock(&vq->mutex);
+
	if (oldsock) {
		vhost_net_flush_vq(n, index);
		fput(oldsock->file);
	}

+	mutex_unlock(&n->dev.mutex);
+	return 0;
+
err_vq:
	mutex_unlock(&vq->mutex);
err:
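The vhost hunk above restores the usual lock discipline: every exit path releases exactly the locks it took, innermost first. A schematic of that shape, with hypothetical types standing in for vhost's, and the inner lock deliberately dropped before the slower flush work:

    #include <linux/mutex.h>

    struct demo_dev { struct mutex mutex; };
    struct demo_vq  { struct mutex mutex; };

    static long set_backend(struct demo_dev *d, struct demo_vq *q, int ok)
    {
    	long r = 0;

    	mutex_lock(&d->mutex);
    	mutex_lock(&q->mutex);

    	if (!ok) {			/* validation failed */
    		r = -EFAULT;
    		goto err_vq;
    	}
    	/* ...swap in the new backend under both locks... */

    	mutex_unlock(&q->mutex);	/* drop before slow flush/fput work */
    	/* ...flush, release the old backend... */
    	mutex_unlock(&d->mutex);
    	return 0;

    err_vq:
    	mutex_unlock(&q->mutex);
    	mutex_unlock(&d->mutex);
    	return r;
    }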
diff --git a/drivers/video/aty/radeon_pm.c b/drivers/video/aty/radeon_pm.c
index 515cf1978d1..c4e17642d9c 100644
--- a/drivers/video/aty/radeon_pm.c
+++ b/drivers/video/aty/radeon_pm.c
@@ -2872,7 +2872,7 @@ void radeonfb_pm_init(struct radeonfb_info *rinfo, int dynclk, int ignore_devlist)
	}

#if 0
-	/* Power down TV DAC, taht saves a significant amount of power,
+	/* Power down TV DAC, that saves a significant amount of power,
	 * we'll have something better once we actually have some TVOut
	 * support
	 */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1d966b0fe..c3df14ce2cc 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
	return ret;
}

+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right.  We'll push up to and including min_slot, but no lower
+ */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
-				      int free_space, u32 left_nritems)
+				      int free_space, u32 left_nritems,
+				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
@@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
	if (empty)
		nr = 0;
	else
-		nr = 1;
+		nr = max_t(u32, 1, min_slot);

	if (path->slots[0] >= left_nritems)
		push_space += data_size;
@@ -2469,10 +2474,14 @@ out_unlock:
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size,
-			   int empty)
+			   *root, struct btrfs_path *path,
+			   int min_data_size, int data_size,
+			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
@@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
	if (left_nritems == 0)
		goto out_unlock;

-	return __push_leaf_right(trans, root, path, data_size, empty,
-				 right, free_space, left_nritems);
+	return __push_leaf_right(trans, root, path, min_data_size, empty,
+				 right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
@@ -2525,12 +2534,17 @@ out_unlock:
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
+ * items
 */
static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
-				     int free_space, int right_nritems)
+				     int free_space, u32 right_nritems,
+				     u32 max_slot)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
@@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
	slot = path->slots[1];

	if (empty)
-		nr = right_nritems;
+		nr = min(right_nritems, max_slot);
	else
-		nr = right_nritems - 1;
+		nr = min(right_nritems - 1, max_slot);

	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(right, i);
@@ -2712,10 +2726,14 @@ out:
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
Use (u32)-1 to make us push all the + * items */ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, int data_size, - int empty) + *root, struct btrfs_path *path, int min_data_size, + int data_size, int empty, u32 max_slot) { struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; @@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - return __push_leaf_left(trans, root, path, data_size, - empty, left, free_space, right_nritems); + return __push_leaf_left(trans, root, path, min_data_size, + empty, left, free_space, right_nritems, + max_slot); out: btrfs_tree_unlock(left); free_extent_buffer(left); @@ -2855,6 +2874,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, } /* + * double splits happen when we need to insert a big item in the middle + * of a leaf. A double split can leave us with 3 mostly empty leaves: + * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ] + * A B C + * + * We avoid this by trying to push the items on either side of our target + * into the adjacent leaves. If all goes well we can avoid the double split + * completely. + */ +static noinline int push_for_double_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int data_size) +{ + int ret; + int progress = 0; + int slot; + u32 nritems; + + slot = path->slots[0]; + + /* + * try to push all the items after our slot into the + * right leaf + */ + ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * our goal is to get our slot at the start or end of a leaf. If + * we've done so we're done + */ + if (path->slots[0] == 0 || path->slots[0] == nritems) + return 0; + + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + + /* try to push all the items before our slot into the next leaf */ + slot = path->slots[0]; + ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot); + if (ret < 0) + return ret; + + if (ret == 0) + progress++; + + if (progress) + return 0; + return 1; +} + +/* * split the path's leaf in two, making sure there is at least data_size * available for the resulting leaf level of the path. 
* @@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int wret; int split; int num_doubles = 0; + int tried_avoid_double = 0; l = path->nodes[0]; slot = path->slots[0]; @@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, return -EOVERFLOW; /* first try to make some room by pushing left and right */ - if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { - wret = push_leaf_right(trans, root, path, data_size, 0); + if (data_size) { + wret = push_leaf_right(trans, root, path, data_size, + data_size, 0, 0); if (wret < 0) return wret; if (wret) { - wret = push_leaf_left(trans, root, path, data_size, 0); + wret = push_leaf_left(trans, root, path, data_size, + data_size, 0, (u32)-1); if (wret < 0) return wret; } @@ -2923,6 +3003,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2; } } @@ -2939,6 +3021,8 @@ again: if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + if (data_size && !tried_avoid_double) + goto push_for_double; split = 2 ; } } @@ -3019,6 +3103,13 @@ again: } return ret; + +push_for_double: + push_for_double_split(trans, root, path, data_size); + tried_avoid_double = 1; + if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + return 0; + goto again; } static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, @@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, extent_buffer_get(leaf); btrfs_set_path_blocking(path); - wret = push_leaf_left(trans, root, path, 1, 1); + wret = push_leaf_left(trans, root, path, 1, 1, + 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, 1); + wret = push_leaf_right(trans, root, path, 1, + 1, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4dbaf89b133..9254b3d58db 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, */ /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE)) + if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) return -EINVAL; ret = mnt_want_write(file->f_path.mnt); @@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, /* determine range to clone */ ret = -EINVAL; - if (off >= src->i_size || off + len > src->i_size) + if (off + len > src->i_size || off + len < off) goto out_unlock; if (len == 0) olen = len = src->i_size - off; @@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 disko = 0, diskl = 0; u64 datao = 0, datal = 0; u8 comp; + u64 endoff; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, @@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_release_path(root, path); inode->i_mtime = inode->i_ctime = CURRENT_TIME; - if (new_key.offset + datal > inode->i_size) - btrfs_i_size_write(inode, - new_key.offset + datal); + + /* + * we round up to the block size at eof when + * determining which extents to clone above, + * but shouldn't round up the file size + */ + endoff = new_key.offset + datal; + if (endoff > off+olen) 
+ endoff = off+olen; + if (endoff > inode->i_size) + btrfs_i_size_write(inode, endoff); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 3fe49042d8a..6d44053ecff 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + kfree(ac->private); ac->private = NULL; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3ab79f6c4ce..416c08d315d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, ceph_encode_filepath(&p, end, ino1, path1); ceph_encode_filepath(&p, end, ino2, path2); + /* make note of release offset, in case we need to replay */ + req->r_request_release_offset = p - msg->front.iov_base; + /* cap releases */ releases = 0; if (req->r_inode_drop) @@ -1580,6 +1583,32 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); + if (req->r_got_unsafe) { + /* + * Replay. Do not regenerate message (and rebuild + * paths, etc.); just use the original message. + * Rebuilding paths will break for renames because + * d_move mangles the src name. + */ + msg = req->r_request; + rhead = msg->front.iov_base; + + flags = le32_to_cpu(rhead->flags); + flags |= CEPH_MDS_FLAG_REPLAY; + rhead->flags = cpu_to_le32(flags); + + if (req->r_target_inode) + rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); + + rhead->num_retry = req->r_attempts - 1; + + /* remove cap/dentry releases from message */ + rhead->num_releases = 0; + msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset); + msg->front.iov_len = req->r_request_release_offset; + return 0; + } + if (req->r_request) { ceph_msg_put(req->r_request); req->r_request = NULL; @@ -1601,13 +1630,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, rhead->flags = cpu_to_le32(flags); rhead->num_fwd = req->r_num_fwd; rhead->num_retry = req->r_attempts - 1; + rhead->ino = 0; dout(" r_locked_dir = %p\n", req->r_locked_dir); - - if (req->r_target_inode && req->r_got_unsafe) - rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode)); - else - rhead->ino = 0; return 0; } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index b292fa42a66..952410c60d0 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -188,6 +188,7 @@ struct ceph_mds_request { int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ + int r_request_release_offset; struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index 9ad43a310a4..15167b2daa5 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con); * nicely render a sockaddr as a string. 
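 *
 * [ed. note: with the %pI4/%pI6c format specifiers adopted below, the
 *  rendered strings look like "192.168.0.1:6789" for IPv4 and
 *  "[::1]:6789" for IPv6 -- illustrative addresses, not from the patch.]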
*/ #define MAX_ADDR_STR 20 -static char addr_str[MAX_ADDR_STR][40]; +#define MAX_ADDR_STR_LEN 60 +static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN]; static DEFINE_SPINLOCK(addr_str_lock); static int last_addr_str; @@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss) int i; char *s; struct sockaddr_in *in4 = (void *)ss; - unsigned char *quad = (void *)&in4->sin_addr.s_addr; struct sockaddr_in6 *in6 = (void *)ss; spin_lock(&addr_str_lock); @@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss) switch (ss->ss_family) { case AF_INET: - sprintf(s, "%u.%u.%u.%u:%u", - (unsigned int)quad[0], - (unsigned int)quad[1], - (unsigned int)quad[2], - (unsigned int)quad[3], - (unsigned int)ntohs(in4->sin_port)); + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr, + (unsigned int)ntohs(in4->sin_port)); break; case AF_INET6: - sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u", - in6->sin6_addr.s6_addr16[0], - in6->sin6_addr.s6_addr16[1], - in6->sin6_addr.s6_addr16[2], - in6->sin6_addr.s6_addr16[3], - in6->sin6_addr.s6_addr16[4], - in6->sin6_addr.s6_addr16[5], - in6->sin6_addr.s6_addr16[6], - in6->sin6_addr.s6_addr16[7], - (unsigned int)ntohs(in6->sin6_port)); + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr, + (unsigned int)ntohs(in6->sin6_port)); break; default: @@ -215,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock, */ static struct socket *ceph_tcp_connect(struct ceph_connection *con) { - struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr; + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; struct socket *sock; int ret; BUG_ON(con->sock); - ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); if (ret) return ERR_PTR(ret); con->sock = sock; @@ -234,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); - ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", pr_addr(&con->peer_addr.in_addr), @@ -1009,19 +999,32 @@ int ceph_parse_ips(const char *c, const char *end, struct sockaddr_in *in4 = (void *)ss; struct sockaddr_in6 *in6 = (void *)ss; int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } memset(ss, 0, sizeof(*ss)); if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr, - ',', &ipend)) { + delim, &ipend)) ss->ss_family = AF_INET; - } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, - ',', &ipend)) { + else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr, + delim, &ipend)) ss->ss_family = AF_INET6; - } else { + else goto bad; - } p = ipend; + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + /* port? 
*/ if (p < end && *p == ':') { port = 0; @@ -1055,7 +1058,7 @@ int ceph_parse_ips(const char *c, const char *end, return 0; bad: - pr_err("parse_ips bad ip '%s'\n", c); + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); return -EINVAL; } @@ -2015,20 +2018,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) { mutex_lock(&con->mutex); if (!list_empty(&msg->list_head)) { - dout("con_revoke %p msg %p\n", con, msg); + dout("con_revoke %p msg %p - was on queue\n", con, msg); list_del_init(&msg->list_head); ceph_msg_put(msg); msg->hdr.seq = 0; - if (con->out_msg == msg) { - ceph_msg_put(con->out_msg); - con->out_msg = NULL; - } + } + if (con->out_msg == msg) { + dout("con_revoke %p msg %p - was sending\n", con, msg); + con->out_msg = NULL; if (con->out_kvec_is_msg) { con->out_skip = con->out_kvec_bytes; con->out_kvec_is_msg = false; } - } else { - dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg); + ceph_msg_put(msg); + msg->hdr.seq = 0; } mutex_unlock(&con->mutex); } diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index 50ce64ebd33..277f8b33957 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) if (ev > CEPH_PG_POOL_VERSION) { pr_warning("got unknown v %d > %d of ceph_pg_pool\n", ev, CEPH_PG_POOL_VERSION); + kfree(pi); goto bad; } __decode_pool(p, pi); diff --git a/fs/dcache.c b/fs/dcache.c index c8c78ba0782..86d4db15473 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent); * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index dbab3fdc258..0898f3ec821 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1358,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) } -static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) +static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index b256d6f2428..8f02d3db8f4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list); static atomic_t qd_lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(qd_lru_lock); -int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask) +int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { struct gfs2_quota_data *qd; struct gfs2_sbd *sdp; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 195f60c8bd1..e7d236ca48b 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; } -extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask); +extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask); extern const struct quotactl_ops gfs2_quotactl_ops; #endif /* __QUOTA_DOT_H__ */ diff --git a/fs/inode.c b/fs/inode.c index 2bee20ae3d6..722860b323a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. 
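 *
 * [ed. note: this series adds a struct shrinker * argument to every
 *  ->shrink callback so a shrinker can find its own context instead of
 *  relying on globals.  A minimal sketch of the new shape, with foo_cache,
 *  foo_scan() and foo_count() as purely hypothetical names:
 *
 *	static int foo_shrink(struct shrinker *shrink, int nr, gfp_t mask)
 *	{
 *		struct foo_cache *c = container_of(shrink,
 *						   struct foo_cache, shrinker);
 *		return nr ? foo_scan(c, nr, mask) : foo_count(c);
 *	}
 *
 *  The XFS conversion at the end of this series uses exactly this
 *  container_of() pattern.]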
 */
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		/*
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff593276..036880895bf 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
-	struct jbd2_buffer_trigger_type *triggers;
 	journal_t *journal = transaction->t_journal;
 
 	/*
@@ -328,21 +327,21 @@ repeat:
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
-		triggers = jh_in->b_frozen_triggers;
 	} else {
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
-		triggers = jh_in->b_triggers;
 	}
 
 	mapped_data = kmap_atomic(new_page, KM_USER0);
 	/*
-	 * Fire any commit trigger.  Do this before checking for escaping,
-	 * as the trigger may modify the magic offset.  If a copy-out
-	 * happens afterwards, it will have the correct data in the buffer.
+	 * Fire the data frozen trigger if the data wasn't already frozen.
+	 * Do this before checking for escaping, as the trigger may modify
+	 * the magic offset.  If a copy-out happens afterwards, it will have
+	 * the correct data in the buffer.
 	 */
-	jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
-				   triggers);
+	if (!done_copy_out)
+		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+					   jh_in->b_triggers);
 
 	/*
 	 * Check for escaping
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620a..b8e0806681b 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
 		page = jh2bh(jh)->b_page;
 		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
 		source = kmap_atomic(page, KM_USER0);
+		/* Fire data frozen trigger just before we copy the data */
+		jbd2_buffer_frozen_trigger(jh, source + offset,
+					   jh->b_triggers);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
 	jh->b_triggers = type;
 }
 
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
 				struct jbd2_buffer_trigger_type *triggers)
 {
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (!triggers || !triggers->t_commit)
+	if (!triggers || !triggers->t_frozen)
 		return;
 
-	triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
 }
 
 void jbd2_buffer_abort_trigger(struct journal_head *jh,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b..d258e261bdc 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 
 static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-	/* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+	/* success of check_xattr_ref_inode() means that inode (ic) does not have
 	 * duplicate name/value pairs. If duplicate name/value pair would be found,
 	 * one will be removed. */
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a..e28f21b9534 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
 * What the mbcache registers as to get shrunk dynamically.
*/ -static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask); +static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); static struct shrinker mb_cache_shrinker = { .shrink = mb_cache_shrink_fn, @@ -191,13 +191,14 @@ forget: * This function is called by the kernel memory management when memory * gets low. * + * @shrink: (ignored) * @nr_to_scan: Number of objects to scan * @gfp_mask: (ignored) * * Returns the number of objects which are present in the cache. */ static int -mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask) +mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free_list); struct list_head *l, *ltmp; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 782b431ef91..e60416d3f81 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head) } } -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(head); struct nfs_inode *nfsi; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index d8bd619e386..e70f44b9b3f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[]; void nfs_close_context(struct nfs_open_context *ctx, int is_sync); /* dir.c */ -extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); +extern int nfs_access_cache_shrinker(struct shrinker *shrink, + int nr_to_scan, gfp_t gfp_mask); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca20cc1..356e976772b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock, dump_stack(); goto bail; } - - past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); - mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, - (unsigned long long)past_eof); - - if (create && (iblock >= past_eof)) - set_buffer_new(bh_result); } + past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); + mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, + (unsigned long long)past_eof); + if (create && (iblock >= past_eof)) + set_buffer_new(bh_result); + bail: if (err < 0) err = -EIO; @@ -459,36 +458,6 @@ int walk_page_buffers( handle_t *handle, return ret; } -handle_t *ocfs2_start_walk_page_trans(struct inode *inode, - struct page *page, - unsigned from, - unsigned to) -{ - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle; - int ret = 0; - - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); - if (ret < 0) - mlog_errno(ret); - } -out: - if (ret) { - if (!IS_ERR(handle)) - ocfs2_commit_trans(osb, handle); - handle = ERR_PTR(ret); - } - return handle; -} - static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) { sector_t status; @@ -1131,23 +1100,37 @@ out: */ static int ocfs2_grab_pages_for_write(struct address_space *mapping, struct ocfs2_write_ctxt *wc, - u32 cpos, loff_t user_pos, int new, + u32 cpos, loff_t user_pos, + unsigned user_len, int new, struct page *mmap_page) { int ret = 0, i; - unsigned long start, target_index, index; + unsigned long start, target_index, end_index, index; struct inode *inode = mapping->host; + loff_t last_byte; target_index = user_pos >> 
PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one - * page. Otherwise, we'll need a whole clusters worth. + * page. Otherwise, we'll need a whole clusters worth. If we're + * writing past i_size, we only need enough pages to cover the + * last page of the write. */ if (new) { wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); + /* + * We need the index *past* the last page we could possibly + * touch. This is the page past the end of the write or + * i_size, whichever is greater. + */ + last_byte = max(user_pos + user_len, i_size_read(inode)); + BUG_ON(last_byte < 1); + end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1; + if ((start + wc->w_num_pages) > end_index) + wc->w_num_pages = end_index - start; } else { wc->w_num_pages = 1; start = target_index; @@ -1620,21 +1603,20 @@ out: * write path can treat it as an non-allocating write, which has no * special case code for sparse/nonsparse files. */ -static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, - unsigned len, +static int ocfs2_expand_nonsparse_inode(struct inode *inode, + struct buffer_head *di_bh, + loff_t pos, unsigned len, struct ocfs2_write_ctxt *wc) { int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); loff_t newsize = pos + len; - if (ocfs2_sparse_alloc(osb)) - return 0; + BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, pos); + ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos); if (ret) mlog_errno(ret); @@ -1644,6 +1626,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, return ret; } +static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh, + loff_t pos) +{ + int ret = 0; + + BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))); + if (pos > i_size_read(inode)) + ret = ocfs2_zero_extend(inode, di_bh, pos); + + return ret; +} + int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata, @@ -1679,7 +1673,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } } - ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc); + if (ocfs2_sparse_alloc(osb)) + ret = ocfs2_zero_tail(inode, di_bh, pos); + else + ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len, + wc); if (ret) { mlog_errno(ret); goto out; @@ -1789,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * that we can zero and flush if we error after adding the * extent. 
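	 * [ed. note: the new len argument below lets a write into a freshly
	 *  allocated cluster past i_size grab only the pages up to the end
	 *  of the write instead of the whole cluster's worth -- e.g. a
	 *  100-byte append into a fresh 1MB cluster with 4k pages pins one
	 *  page rather than 256.  Illustrative numbers, not from the patch.]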
 */
-	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
 					cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e174..153abb5abef 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
 	struct dlm_ctxt *dlm = NULL;
 	struct dlm_ctxt *new_ctxt = NULL;
 
-	if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+	if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
 		ret = -ENAMETOOLONG;
 		mlog(ML_ERROR, "domain name length too long\n");
 		goto leave;
@@ -1709,6 +1709,7 @@ retry:
 	}
 
 	if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+		spin_unlock(&dlm_domain_lock);
 		mlog(ML_ERROR,
 		     "Requested locking protocol version is not "
 		     "compatible with already registered domain "
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e31..94b97fc6a88 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
 		mlog(0, "trying again...\n");
 		goto again;
 	}
-	/* now that we are sure the MIGRATING state is there, drop
-	 * the unneded state which blocked threads trying to DIRTY */
-	spin_lock(&res->spinlock);
-	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
-	BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
-	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
-	spin_unlock(&res->spinlock);
 
+	ret = 0;
 	/* did the target go down or die? */
 	spin_lock(&dlm->spinlock);
 	if (!test_bit(target, dlm->domain_map)) {
@@ -2826,9 +2820,21 @@ again:
 	spin_unlock(&dlm->spinlock);
 
 	/*
+	 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+	 * another try; otherwise, we are sure the MIGRATING state is there,
+	 * so drop the unneeded state which blocked threads trying to DIRTY
+	 */
+	spin_lock(&res->spinlock);
+	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+	if (!ret)
+		BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+	spin_unlock(&res->spinlock);
+
+	/*
 	 * at this point:
 	 *
-	 * o the DLM_LOCK_RES_MIGRATING flag is set
+	 * o the DLM_LOCK_RES_MIGRATING flag is set if target not down
 	 * o there are no pending asts on this lockres
 	 * o all processes trying to reserve an ast on this
 	 *   lockres must wait for the MIGRATING flag to clear
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be7..9dfaac73b36 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
 		int bit;
 
-		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
 		if (bit >= O2NM_MAX_NODES || bit < 0)
 			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 		else
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea64c44..2b10b36d157 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
 	return status;
 }
 
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
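+ *
+ * [ed. note: "order" here means jbd2 data=ordered semantics -- the
+ *  zeroed pages must reach disk before the transaction that updates the
+ *  inode commits, so a crash cannot expose stale block contents inside
+ *  the newly extended range.]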
+ */ +static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + int ret = 0; + + if (!ocfs2_should_order_data(inode)) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_jbd2_file_inode(handle, inode); + if (ret < 0) + mlog_errno(ret); + +out: + if (ret) { + if (!IS_ERR(handle)) + ocfs2_commit_trans(osb, handle); + handle = ERR_PTR(ret); + } + return handle; +} + /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to * worry about recursive locking in ->write_begin() and ->write_end(). */ -static int ocfs2_write_zero_page(struct inode *inode, - u64 size) +static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, + u64 abs_to) { struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long index; - unsigned int offset; + unsigned long index = abs_from >> PAGE_CACHE_SHIFT; handle_t *handle = NULL; - int ret; + int ret = 0; + unsigned zero_from, zero_to, block_start, block_end; - offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */ - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - offset++; - } - index = size >> PAGE_CACHE_SHIFT; + BUG_ON(abs_from >= abs_to); + BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); + BUG_ON(abs_from & (inode->i_blkbits - 1)); page = grab_cache_page(mapping, index); if (!page) { @@ -754,31 +781,56 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); - if (ret < 0) { - mlog_errno(ret); - goto out_unlock; - } + /* Get the offsets within the page that we want to zero */ + zero_from = abs_from & (PAGE_CACHE_SIZE - 1); + zero_to = abs_to & (PAGE_CACHE_SIZE - 1); + if (!zero_to) + zero_to = PAGE_CACHE_SIZE; - if (ocfs2_should_order_data(inode)) { - handle = ocfs2_start_walk_page_trans(inode, page, offset, - offset); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + mlog(0, + "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n", + (unsigned long long)abs_from, (unsigned long long)abs_to, + index, zero_from, zero_to); + + /* We know that zero_from is block aligned */ + for (block_start = zero_from; block_start < zero_to; + block_start = block_end) { + block_end = block_start + (1 << inode->i_blkbits); + + /* + * block_start is block-aligned. Bump it by one to + * force ocfs2_{prepare,commit}_write() to zero the + * whole block. + */ + ret = ocfs2_prepare_write_nolock(inode, page, + block_start + 1, + block_start + 1); + if (ret < 0) { + mlog_errno(ret); goto out_unlock; } - } - /* must not update i_size! */ - ret = block_commit_write(page, offset, offset); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + if (!handle) { + handle = ocfs2_zero_start_ordered_transaction(inode); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + break; + } + } + + /* must not update i_size! 
*/ + ret = block_commit_write(page, block_start + 1, + block_start + 1); + if (ret < 0) + mlog_errno(ret); + else + ret = 0; + } if (handle) ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + out_unlock: unlock_page(page); page_cache_release(page); @@ -786,22 +838,114 @@ out: return ret; } -static int ocfs2_zero_extend(struct inode *inode, - u64 zero_to_size) +/* + * Find the next range to zero. We do this in terms of bytes because + * that's what ocfs2_zero_extend() wants, and it is dealing with the + * pagecache. We may return multiple extents. + * + * zero_start and zero_end are ocfs2_zero_extend()s current idea of what + * needs to be zeroed. range_start and range_end return the next zeroing + * range. A subsequent call should pass the previous range_end as its + * zero_start. If range_end is 0, there's nothing to do. + * + * Unwritten extents are skipped over. Refcounted extents are CoWd. + */ +static int ocfs2_zero_extend_get_range(struct inode *inode, + struct buffer_head *di_bh, + u64 zero_start, u64 zero_end, + u64 *range_start, u64 *range_end) { - int ret = 0; - u64 start_off; - struct super_block *sb = inode->i_sb; + int rc = 0, needs_cow = 0; + u32 p_cpos, zero_clusters = 0; + u32 zero_cpos = + zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; - start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); - while (start_off < zero_to_size) { - ret = ocfs2_write_zero_page(inode, start_off); - if (ret < 0) { - mlog_errno(ret); + while (zero_cpos < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos, + &num_clusters, &ext_flags); + if (rc) { + mlog_errno(rc); + goto out; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + zero_clusters = num_clusters; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + break; + } + + zero_cpos += num_clusters; + } + if (!zero_clusters) { + *range_end = 0; + goto out; + } + + while ((zero_cpos + zero_clusters) < last_cpos) { + rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters, + &p_cpos, &num_clusters, + &ext_flags); + if (rc) { + mlog_errno(rc); goto out; } - start_off += sb->s_blocksize; + if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)) + break; + if (ext_flags & OCFS2_EXT_REFCOUNTED) + needs_cow = 1; + zero_clusters += num_clusters; + } + if ((zero_cpos + zero_clusters) > last_cpos) + zero_clusters = last_cpos - zero_cpos; + + if (needs_cow) { + rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters, + UINT_MAX); + if (rc) { + mlog_errno(rc); + goto out; + } + } + + *range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos); + *range_end = ocfs2_clusters_to_bytes(inode->i_sb, + zero_cpos + zero_clusters); + +out: + return rc; +} + +/* + * Zero one range returned from ocfs2_zero_extend_get_range(). The caller + * has made sure that the entire range needs zeroing. 
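+ *
+ * [ed. note: the loop below advances one pagecache page at a time --
+ *  e.g. zeroing 0..10000 with 4k pages issues ocfs2_write_zero_page()
+ *  for 0..4096, 4096..8192 and 8192..10000.  Illustrative numbers.]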
+ */ +static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start, + u64 range_end) +{ + int rc = 0; + u64 next_pos; + u64 zero_pos = range_start; + + mlog(0, "range_start = %llu, range_end = %llu\n", + (unsigned long long)range_start, + (unsigned long long)range_end); + BUG_ON(range_start >= range_end); + + while (zero_pos < range_end) { + next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE; + if (next_pos > range_end) + next_pos = range_end; + rc = ocfs2_write_zero_page(inode, zero_pos, next_pos); + if (rc < 0) { + mlog_errno(rc); + break; + } + zero_pos = next_pos; /* * Very large extends have the potential to lock up @@ -810,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode, cond_resched(); } -out: + return rc; +} + +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to_size) +{ + int ret = 0; + u64 zero_start, range_start = 0, range_end = 0; + struct super_block *sb = inode->i_sb; + + zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); + mlog(0, "zero_start %llu for i_size %llu\n", + (unsigned long long)zero_start, + (unsigned long long)i_size_read(inode)); + while (zero_start < zero_to_size) { + ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start, + zero_to_size, + &range_start, + &range_end); + if (ret) { + mlog_errno(ret); + break; + } + if (!range_end) + break; + /* Trim the ends */ + if (range_start < zero_start) + range_start = zero_start; + if (range_end > zero_to_size) + range_end = zero_to_size; + + ret = ocfs2_zero_extend_range(inode, range_start, + range_end); + if (ret) { + mlog_errno(ret); + break; + } + zero_start = range_end; + } + return ret; } -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to) { int ret; u32 clusters_to_add; struct ocfs2_inode_info *oi = OCFS2_I(inode); + /* + * Only quota files call this without a bh, and they can't be + * refcounted. + */ + BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); + clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); if (clusters_to_add < oi->ip_clusters) clusters_to_add = 0; @@ -840,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) * still need to zero the area between the old i_size and the * new i_size. */ - ret = ocfs2_zero_extend(inode, zero_to); + ret = ocfs2_zero_extend(inode, di_bh, zero_to); if (ret < 0) mlog_errno(ret); @@ -862,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode, goto out; if (i_size_read(inode) == new_i_size) - goto out; + goto out; BUG_ON(new_i_size < i_size_read(inode)); /* - * Fall through for converting inline data, even if the fs - * supports sparse files. - * - * The check for inline data here is legal - nobody can add - * the feature since we have i_mutex. We must check it again - * after acquiring ip_alloc_sem though, as paths like mmap - * might have raced us to converting the inode to extents. - */ - if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) - && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - goto out_update_size; - - /* * The alloc sem blocks people in read/write from reading our * allocation until we're done changing it. We depend on * i_mutex to block other extend/truncate calls while we're - * here. + * here. We even have to hold it for sparse files because there + * might be some tail zeroing. 
*/ down_write(&oi->ip_alloc_sem); @@ -899,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode, ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); if (ret) { up_write(&oi->ip_alloc_sem); - mlog_errno(ret); goto out; } } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size); + if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) + ret = ocfs2_zero_extend(inode, di_bh, new_i_size); + else + ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size, + new_i_size); up_write(&oi->ip_alloc_sem); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index d66cf4f7c70..97bf761c9e7 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, int ocfs2_simple_size_update(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size); -int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, - u64 zero_to); +int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, + u64 new_i_size, u64 zero_to); +int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, + loff_t zero_to); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 47878cf1641..625de9d7088 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger return container_of(triggers, struct ocfs2_triggers, ot_triggers); } -static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Quota blocks have their own trigger because the struct ocfs2_block_check * offset depends on the blocksize. */ -static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, * Directory blocks also have their own trigger because the * struct ocfs2_block_check offset depends on the blocksize. 
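 *
 * [ed. note: the t_commit -> t_frozen rename mirrors the jbd2 change
 *  earlier in this series: the trigger now fires when jbd2 freezes the
 *  buffer data (copy-out or metadata write), not at commit time, so the
 *  checksum is computed over the bytes that will actually hit the log.]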
*/ -static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, +static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size) { @@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, static struct ocfs2_triggers di_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dinode, i_check), @@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = { static struct ocfs2_triggers eb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_extent_block, h_check), @@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = { static struct ocfs2_triggers rb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_refcount_block, rf_check), @@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = { static struct ocfs2_triggers gd_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), @@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = { static struct ocfs2_triggers db_triggers = { .ot_triggers = { - .t_commit = ocfs2_db_commit_trigger, + .t_frozen = ocfs2_db_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers xb_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), @@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = { static struct ocfs2_triggers dq_triggers = { .ot_triggers = { - .t_commit = ocfs2_dq_commit_trigger, + .t_frozen = ocfs2_dq_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, }; static struct ocfs2_triggers dr_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_root_block, dr_check), @@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = { static struct ocfs2_triggers dl_triggers = { .ot_triggers = { - .t_commit = ocfs2_commit_trigger, + .t_frozen = ocfs2_frozen_trigger, .t_abort = ocfs2_abort_trigger, }, .ot_offset = offsetof(struct ocfs2_dx_leaf, dl_check), @@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work) mutex_lock(&os->os_lock); ocfs2_queue_orphan_scan(osb); if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE) - schedule_delayed_work(&os->os_orphan_scan_work, + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, ocfs2_orphan_scan_timeout()); mutex_unlock(&os->os_lock); } @@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE); - schedule_delayed_work(&os->os_orphan_scan_work, - ocfs2_orphan_scan_timeout()); + queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work, + ocfs2_orphan_scan_timeout()); } } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 3d7419682dc..ec6adbf8f55 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -118,6 +118,7 @@ unsigned int 
ocfs2_la_default_mb(struct ocfs2_super *osb)
 {
 	unsigned int la_mb;
 	unsigned int gd_mb;
+	unsigned int la_max_mb;
 	unsigned int megs_per_slot;
 	struct super_block *sb = osb->sb;
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
 	if (megs_per_slot < la_mb)
 		la_mb = megs_per_slot;
 
+	/* We can't store more bits than fit in a block. */
+	la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+						ocfs2_local_alloc_size(sb) * 8);
+	if (la_mb > la_max_mb)
+		la_mb = la_max_mb;
+
 	return la_mb;
 }
 
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe0051..4607923eb24 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	 * locking allocators ranks above a transaction start
 	 */
 	WARN_ON(journal_current_handle());
-	status = ocfs2_extend_no_holes(gqinode,
+	status = ocfs2_extend_no_holes(gqinode, NULL,
 			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
 			gqinode->i_size);
 	if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184..dc78764ccc4 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	u64 p_blkno;
 
 	/* We are protected by dqio_sem so no locking needed */
-	status = ocfs2_extend_no_holes(lqinode,
+	status = ocfs2_extend_no_holes(lqinode, NULL,
 			lqinode->i_size + 2 * sb->s_blocksize,
 			lqinode->i_size);
 	if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		return ocfs2_local_quota_add_chunk(sb, type, offset);
 
 	/* We are protected by dqio_sem so no locking needed */
-	status = ocfs2_extend_no_holes(lqinode,
+	status = ocfs2_extend_no_holes(lqinode, NULL,
 			lqinode->i_size + sb->s_blocksize,
 			lqinode->i_size);
 	if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f651..3ac5aa733e9 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+	/*
+	 * We only duplicate pages until we reach the page that contains
+	 * i_size - 1.  So trim 'end' to i_size.
+ */ + if (end > i_size_read(context->inode)) + end = i_size_read(context->inode); while (offset < end) { page_index = offset >> PAGE_CACHE_SHIFT; @@ -4166,6 +4172,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry, struct inode *inode = old_dentry->d_inode; struct buffer_head *new_bh = NULL; + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + ret = filemap_fdatawrite(inode->i_mapping); if (ret) { mlog_errno(ret); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f4c2a9eb8c4..a8e6a95a353 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, le16_to_cpu(bg->bg_free_bits_count)); le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits)); - cl->cl_recs[alloc_rec].c_blkno = cpu_to_le64(bg->bg_blkno); + cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno; if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count)) le16_add_cpu(&cl->cl_next_free_rec, 1); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e97b34842cf..d03469f6180 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, struct ocfs2_xattr_value_buf *vb, struct ocfs2_xattr_set_ctxt *ctxt) { - int status = 0; + int status = 0, credits; handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); @@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode, ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb); - status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } + while (clusters_to_add) { + status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + break; + } - prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); - status = ocfs2_add_clusters_in_btree(handle, - &et, - &logical_start, - clusters_to_add, - 0, - ctxt->data_ac, - ctxt->meta_ac, - &why); - if (status < 0) { - mlog_errno(status); - goto leave; - } + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(handle, + &et, + &logical_start, + clusters_to_add, + 0, + ctxt->data_ac, + ctxt->meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + break; + } - ocfs2_journal_dirty(handle, vb->vb_bh); + ocfs2_journal_dirty(handle, vb->vb_bh); - clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - + prev_clusters; - /* - * We should have already allocated enough space before the transaction, - * so no need to restart. - */ - BUG_ON(why != RESTART_NONE || clusters_to_add); - -leave: + if (why != RESTART_NONE && clusters_to_add) { + /* + * We can only fail in case the alloc file doesn't give + * up enough clusters. 
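+			 *
+			 * [ed. note: RESTART_META would mean the metadata
+			 *  reservation made before this transaction ran dry,
+			 *  which the caller is expected to have sized
+			 *  correctly -- hence the BUG_ON just below; any
+			 *  other restart only needs the handle extended
+			 *  before looping again.]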
+			 */
+			BUG_ON(why == RESTART_META);
+
+			mlog(0, "restarting xattr value extension for %u"
+			     " clusters.\n", clusters_to_add);
+			credits = ocfs2_calc_extend_credits(inode->i_sb,
+							    &vb->vb_xv->xr_list,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				break;
+			}
+		}
+	}
 	return status;
 }
@@ -6788,16 +6804,15 @@ out:
 	return ret;
 }
 
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
 				u64 blkno, u64 new_blkno, u32 clusters,
+				u32 *cpos, int num_buckets,
 				struct ocfs2_alloc_context *meta_ac,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_reflink_xattr_tree_args *args)
 {
 	int i, j, ret = 0;
 	struct super_block *sb = args->reflink->old_inode->i_sb;
-	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-	u32 num_buckets = clusters * bpc;
 	int bpb = args->old_bucket->bu_blocks;
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_access = ocfs2_journal_access,
@@ -6816,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			break;
 		}
 
-		/*
-		 * The real bucket num in this series of blocks is stored
-		 * in the 1st bucket.
-		 */
-		if (i == 0)
-			num_buckets = le16_to_cpu(
-				bucket_xh(args->old_bucket)->xh_num_buckets);
-
 		ret = ocfs2_xattr_bucket_journal_access(handle,
 						args->new_bucket,
 						OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6837,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			       bucket_block(args->old_bucket, j),
 			       sb->s_blocksize);
 
+		/*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree; we also set xh_num_buckets for the new
+		 * bucket.
+		 */
+		if (i == 0) {
+			*cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+					    xh_entries[0].xe_name_hash);
+			bucket_xh(args->new_bucket)->xh_num_buckets =
+				cpu_to_le16(num_buckets);
+		}
+
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
 
 		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6866,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 		}
 
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+		ocfs2_xattr_bucket_relse(args->old_bucket);
 		ocfs2_xattr_bucket_relse(args->new_bucket);
 	}
 
@@ -6874,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 	ocfs2_xattr_bucket_relse(args->new_bucket);
 	return ret;
 }
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				struct inode *inode,
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				u64 blkno, u32 cpos, u32 len)
+{
+	int ret, first_inserted = 0;
+	u32 p_cluster, num_clusters, reflink_cpos = 0;
+	u64 new_blkno;
+	unsigned int num_buckets, reflink_buckets;
+	unsigned int bpc =
+		ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+	ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+
+	while (len && num_buckets) {
+		ret = ocfs2_claim_clusters(handle, data_ac,
+					   1, &p_cluster, &num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+		ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+						 new_blkno, num_clusters,
+						 &reflink_cpos, reflink_buckets,
+						 meta_ac, data_ac, args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * For the 1st allocated cluster, we make it use the same cpos
+		 * so that the xattr tree looks the same as the original one
+		 * in most cases.
+		 */
+		if (!first_inserted) {
+			reflink_cpos = cpos;
+			first_inserted = 1;
+		}
+		ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+					  num_clusters, 0, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+
+		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+		len -= num_clusters;
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		num_buckets -= reflink_buckets;
+	}
+out:
+	return ret;
+}
+
 /*
  * Create the same xattr extent record in the new inode's xattr tree.
  */
@@ -6885,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 				   void *para)
 {
 	int ret, credits = 0;
-	u32 p_cluster, num_clusters;
-	u64 new_blkno;
 	handle_t *handle;
 	struct ocfs2_reflink_xattr_tree_args *args =
 			(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6895,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
+	mlog(0, "reflink xattr buckets %llu len %u\n",
+	     (unsigned long long)blkno, len);
+
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
 					  args->new_blk_bh);
@@ -6914,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(handle, data_ac,
-				   len, &p_cluster, &num_clusters);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
-	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-
-	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-					  meta_ac, data_ac, args);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-	     (unsigned long long)new_blkno, len, cpos);
-	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-				  len, 0, meta_ac);
+	ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+					  meta_ac, data_ac,
+					  blkno, cpos, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967f..fc8497643fd 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
 	} *label;
 	unsigned char *data;
 	Sector sect;
+	sector_t labelsect;
 
 	res = 0;
 	blocksize = bdev_logical_block_size(bdev);
@@ -98,10 +99,19 @@ int ibm_partition(struct parsed_partitions *state)
 		goto out_freeall;
 
 	/*
+	 * Special case for FBA disks: label sector does not depend on
+	 * blocksize.
+	 */
+	if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+	    (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+		labelsect = info->label_block;
+	else
+		labelsect = info->label_block * (blocksize >> 9);
+
+	/*
 	 * Get volume label, extract name and type.
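	 *
	 * [ed. note: for the FBA devices special-cased above the label
	 *  always sits at block info->label_block, while for other DASD the
	 *  sector number scales with the block size -- e.g. label_block * 8
	 *  on a 4k blocksize device, since read_part_sector() works in
	 *  512-byte units.]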
*/ - data = read_part_sector(state, info->label_block*(blocksize/512), - §); + data = read_part_sector(state, labelsect, §); if (data == NULL) goto out_readerr; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 12c233da1b6..437d2ca2de9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -676,7 +676,7 @@ static void prune_dqcache(int count) * This is called from kswapd when we think we need some * more memory */ -static int shrink_dqcache_memory(int nr, gfp_t gfp_mask) +static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask) { if (nr) { spin_lock(&dq_list_lock); diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index 02feb59cefc..0b201114a5a 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -277,7 +277,7 @@ static int kick_a_thread(void) return 0; } -int ubifs_shrinker(int nr, gfp_t gfp_mask) +int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask) { int freed, contention = 0; long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 2eef553d50c..04310878f44 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); int ubifs_tnc_end_commit(struct ubifs_info *c); /* shrinker.c */ -int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); +int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask); /* commit.c */ int ubifs_bg_thread(void *info); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 649ade8ef59..2ee3f7a6016 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -45,7 +45,7 @@ static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); -STATIC int xfsbufd_wakeup(int, gfp_t); +STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t); STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int); static struct shrinker xfs_buf_shake = { .shrink = xfsbufd_wakeup, @@ -340,7 +340,7 @@ _xfs_buf_lookup_pages( __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); - xfsbufd_wakeup(0, gfp_mask); + xfsbufd_wakeup(NULL, 0, gfp_mask); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } @@ -1762,6 +1762,7 @@ xfs_buf_runall_queues( STATIC int xfsbufd_wakeup( + struct shrinker *shrink, int priority, gfp_t mask) { diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index f2d1718c916..80938c736c2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1883,7 +1883,6 @@ init_xfs_fs(void) goto out_cleanup_procfs; vfs_initquota(); - xfs_inode_shrinker_init(); error = register_filesystem(&xfs_fs_type); if (error) @@ -1911,7 +1910,6 @@ exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); - xfs_inode_shrinker_destroy(); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index ef7f0218bcc..a51a07c3a70 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -144,6 +144,41 @@ restart: return last_error; } +/* + * Select the next per-ag structure to iterate during the walk. The reclaim + * walk is optimised only to walk AGs with reclaimable inodes in them. 
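+ *
+ * [ed. note: "reclaimable" is tracked by XFS_ICI_RECLAIM_TAG on the
+ *  per-mount perag radix tree; the tag is propagated up and torn down
+ *  in __xfs_inode_set_reclaim_tag()/__xfs_inode_clear_reclaim_tag()
+ *  later in this file's diff.]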
+ */ +static struct xfs_perag * +xfs_inode_ag_iter_next_pag( + struct xfs_mount *mp, + xfs_agnumber_t *first, + int tag) +{ + struct xfs_perag *pag = NULL; + + if (tag == XFS_ICI_RECLAIM_TAG) { + int found; + int ref; + + spin_lock(&mp->m_perag_lock); + found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, + (void **)&pag, *first, 1, tag); + if (found <= 0) { + spin_unlock(&mp->m_perag_lock); + return NULL; + } + *first = pag->pag_agno + 1; + /* open coded pag reference increment */ + ref = atomic_inc_return(&pag->pag_ref); + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); + } else { + pag = xfs_perag_get(mp, *first); + (*first)++; + } + return pag; +} + int xfs_inode_ag_iterator( struct xfs_mount *mp, @@ -154,16 +189,15 @@ xfs_inode_ag_iterator( int exclusive, int *nr_to_scan) { + struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; int nr; nr = nr_to_scan ? *nr_to_scan : INT_MAX; - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, ag); + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, exclusive, &nr); xfs_perag_put(pag); @@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + + if (!pag->pag_ici_reclaimable) { + /* propagate the reclaim tag up into the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_set(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } pag->pag_ici_reclaimable++; } @@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag( radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); pag->pag_ici_reclaimable--; + if (!pag->pag_ici_reclaimable) { + /* clear the reclaim tag from the perag radix tree */ + spin_lock(&ip->i_mount->m_perag_lock); + radix_tree_tag_clear(&ip->i_mount->m_perag_tree, + XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + spin_unlock(&ip->i_mount->m_perag_lock); + trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, + -1, _RET_IP_); + } } /* @@ -828,83 +883,52 @@ xfs_reclaim_inodes( /* * Shrinker infrastructure. - * - * This is all far more complex than it needs to be. It adds a global list of - * mounts because the shrinkers can only call a global context. We need to make - * the shrinkers pass a context to avoid the need for global state. 
*/ -static LIST_HEAD(xfs_mount_list); -static struct rw_semaphore xfs_mount_list_lock; - static int xfs_reclaim_inode_shrink( + struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct xfs_mount *mp; struct xfs_perag *pag; xfs_agnumber_t ag; - int reclaimable = 0; + int reclaimable; + mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { if (!(gfp_mask & __GFP_FS)) return -1; - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); - if (nr_to_scan <= 0) - break; - } - up_read(&xfs_mount_list_lock); - } + /* if we don't exhaust the scan, don't bother coming back */ + if (nr_to_scan > 0) + return -1; + } - down_read(&xfs_mount_list_lock); - list_for_each_entry(mp, &xfs_mount_list, m_mplist) { - for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { - pag = xfs_perag_get(mp, ag); - reclaimable += pag->pag_ici_reclaimable; - xfs_perag_put(pag); - } + reclaimable = 0; + ag = 0; + while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, + XFS_ICI_RECLAIM_TAG))) { + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); } - up_read(&xfs_mount_list_lock); return reclaimable; } -static struct shrinker xfs_inode_shrinker = { - .shrink = xfs_reclaim_inode_shrink, - .seeks = DEFAULT_SEEKS, -}; - -void __init -xfs_inode_shrinker_init(void) -{ - init_rwsem(&xfs_mount_list_lock); - register_shrinker(&xfs_inode_shrinker); -} - -void -xfs_inode_shrinker_destroy(void) -{ - ASSERT(list_empty(&xfs_mount_list)); - unregister_shrinker(&xfs_inode_shrinker); -} - void xfs_inode_shrinker_register( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_add_tail(&mp->m_mplist, &xfs_mount_list); - up_write(&xfs_mount_list_lock); + mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink; + mp->m_inode_shrink.seeks = DEFAULT_SEEKS; + register_shrinker(&mp->m_inode_shrink); } void xfs_inode_shrinker_unregister( struct xfs_mount *mp) { - down_write(&xfs_mount_list_lock); - list_del(&mp->m_mplist); - up_write(&xfs_mount_list_lock); + unregister_shrinker(&mp->m_inode_shrink); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index cdcbaaca988..e28139aaa4a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -55,8 +55,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, int tag, int write_lock, int *nr_to_scan); -void xfs_inode_shrinker_init(void); -void xfs_inode_shrinker_destroy(void); void xfs_inode_shrinker_register(struct xfs_mount *mp); void xfs_inode_shrinker_unregister(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index 73d5aa11738..30282069090 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, agno, refcount, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_put); +DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); TRACE_EVENT(xfs_attr_list_node_descend, TP_PROTO(struct xfs_attr_list_context *ctx, diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 8c117ff2e3a..67c018392d6 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -69,7 +69,7 @@ STATIC void 
xfs_qm_list_destroy(xfs_dqlist_t *); STATIC int xfs_qm_init_quotainos(xfs_mount_t *); STATIC int xfs_qm_init_quotainfo(xfs_mount_t *); -STATIC int xfs_qm_shake(int, gfp_t); +STATIC int xfs_qm_shake(struct shrinker *, int, gfp_t); static struct shrinker xfs_qm_shaker = { .shrink = xfs_qm_shake, @@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist( */ /* ARGSUSED */ STATIC int -xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask) +xfs_qm_shake( + struct shrinker *shrink, + int nr_to_scan, + gfp_t gfp_mask) { int ndqused, nfree, n; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1d2c7eed4ed..5761087ee8e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -259,7 +259,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ - struct list_head m_mplist; /* inode shrinker mount list */ + struct shrinker m_inode_shrink; /* inode reclaim shrinker */ } xfs_mount_t; /* diff --git a/include/linux/cpu.h b/include/linux/cpu.h index e287863ac05..de6b1722cdc 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -48,6 +48,31 @@ extern ssize_t arch_cpu_release(const char *, size_t); #endif struct notifier_block; +/* + * CPU notifier priorities. + */ +enum { + /* + * SCHED_ACTIVE marks a cpu which is coming up active during + * CPU_ONLINE and CPU_DOWN_FAILED and must be the first + * notifier. CPUSET_ACTIVE adjusts cpuset according to + * cpu_active mask right after SCHED_ACTIVE. During + * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are + * ordered in a similar way. + * + * This ordering guarantees a consistent cpu_active mask and + * migration behavior for all cpu notifiers. + */ + CPU_PRI_SCHED_ACTIVE = INT_MAX, + CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, + CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, + CPU_PRI_CPUSET_INACTIVE = INT_MIN, + + /* migration should happen before other stuff but after perf */ + CPU_PRI_PERF = 20, + CPU_PRI_MIGRATION = 10, +}; + #ifdef CONFIG_SMP /* Need to know about CPUs going up/down? */ #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 457ed765a11..f20eb8f1602 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system?
*/ extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern int cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask) static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_update_active_cpus(void) +{ + partition_sched_domains(1, NULL, NULL); +} + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/include/linux/fb.h b/include/linux/fb.h index 8e5a9dfb76b..e7445df44d6 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -873,6 +873,8 @@ struct fb_info { static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { struct apertures_struct *a = kzalloc(sizeof(struct apertures_struct) + max_num * sizeof(struct aperture), GFP_KERNEL); + if (!a) + return NULL; a->count = max_num; return a; } diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 013dc529e95..d147461bc27 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -61,7 +61,8 @@ struct files_struct { (rcu_dereference_check((fdtfd), \ rcu_read_lock_held() || \ lockdep_is_held(&(files)->file_lock) || \ - atomic_read(&(files)->count) == 1)) + atomic_read(&(files)->count) == 1 || \ + rcu_my_thread_group_empty())) #define files_fdtable(files) \ (rcu_dereference_check_fdtable((files), (files)->fdt)) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a4d2e9f7088..adf832dec3f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1026,11 +1026,12 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); struct jbd2_buffer_trigger_type { /* - * Fired just before a buffer is written to the journal. - * mapped_data is a mapped buffer that is the frozen data for - * commit. + * Fired at the moment data to write to the journal are known to be + * stable - so either at the moment b_frozen_data is created or just + * before a buffer is written to the journal. mapped_data is a mapped + * buffer that is the frozen data for commit. */ - void (*t_commit)(struct jbd2_buffer_trigger_type *type, + void (*t_frozen)(struct jbd2_buffer_trigger_type *type, struct buffer_head *bh, void *mapped_data, size_t size); @@ -1042,7 +1043,7 @@ struct jbd2_buffer_trigger_type { struct buffer_head *bh); }; -extern void jbd2_buffer_commit_trigger(struct journal_head *jh, +extern void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data, struct jbd2_buffer_trigger_type *triggers); extern void jbd2_buffer_abort_trigger(struct journal_head *jh, diff --git a/include/linux/mm.h b/include/linux/mm.h index b969efb0378..a2b48041b91 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -999,7 +999,7 @@ static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) * querying the cache size, so a fastpath for that case is appropriate.
*/ struct shrinker { - int (*shrink)(int nr_to_scan, gfp_t gfp_mask); + int (*shrink)(struct shrinker *, int nr_to_scan, gfp_t gfp_mask); int seeks; /* seeks to recreate an obj */ /* These are for internal use */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 7cb00845f15..f26fda76b87 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -288,6 +288,7 @@ struct pci_dev { */ unsigned int irq; struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */ + resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]; /* FW-assigned addr */ /* These fields are used by common fixups */ unsigned int transparent:1; /* Transparent PCI bridge */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5d0266d9498..469e03e96fe 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1068,7 +1068,7 @@ static inline void perf_event_disable(struct perf_event *event) { } #define perf_cpu_notifier(fn) \ do { \ static struct notifier_block fn##_nb __cpuinitdata = \ - { .notifier_call = fn, .priority = 20 }; \ + { .notifier_call = fn, .priority = CPU_PRI_PERF }; \ fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE, \ (void *)(unsigned long)smp_processor_id()); \ fn(&fn##_nb, (unsigned long)CPU_STARTING, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e0bb86de99..2091ea2a2c5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -271,13 +271,10 @@ extern int runqueue_is_locked(int cpu); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) -extern int select_nohz_load_balancer(int cpu); -extern int get_nohz_load_balancer(void); +extern void select_nohz_load_balancer(int stop_tick); +extern int get_nohz_timer_target(void); #else -static inline int select_nohz_load_balancer(int cpu) -{ - return 0; -} +static inline void select_nohz_load_balancer(int stop_tick) { } #endif /* @@ -798,7 +795,7 @@ enum cpu_idle_type { #define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ - +#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ enum powersavings_balance_level { @@ -833,6 +830,8 @@ static inline int sd_balance_for_package_power(void) return SD_PREFER_SIBLING; } +extern int __weak arch_sd_sibling_asym_packing(void); + /* * Optimise SD flags for power savings: * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings. @@ -854,7 +853,7 @@ struct sched_group { * CPU power of this group, SCHED_LOAD_SCALE being max power for a * single CPU. */ - unsigned int cpu_power; + unsigned int cpu_power, cpu_power_orig; /* * The CPUs this group covers.
@@ -1690,6 +1689,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ +#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ @@ -1784,20 +1784,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) #endif /* - * Architectures can set this to 1 if they have specified - * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, - * but then during bootup it turns out that sched_clock() - * is reliable after all: + * Do not use outside of architecture code which knows its limitations. + * + * sched_clock() has no promise of monotonicity or bounded drift between + * CPUs, and using it (which you should not) requires disabling IRQs. + * + * Please use one of the three interfaces below. */ -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -extern int sched_clock_stable; -#endif - -/* ftrace calls sched_clock() directly */ extern unsigned long long notrace sched_clock(void); +/* + * See the comment in kernel/sched_clock.c + */ +extern u64 cpu_clock(int cpu); +extern u64 local_clock(void); +extern u64 sched_clock_cpu(int cpu); + extern void sched_clock_init(void); -extern u64 sched_clock_cpu(int cpu); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline void sched_clock_tick(void) @@ -1812,17 +1815,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } #else +/* + * Architectures can set this to 1 if they have specified + * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, + * but then during bootup it turns out that sched_clock() + * is reliable after all: + */ +extern int sched_clock_stable; + extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); #endif -/* - * For kernel-internal use: high-speed (but slightly incorrect) per-cpu - * clock constructed from sched_clock(): - */ -extern unsigned long long cpu_clock(int cpu); - extern unsigned long long task_sched_runtime(struct task_struct *task); extern unsigned long long thread_group_sched_runtime(struct task_struct *task); diff --git a/include/linux/topology.h b/include/linux/topology.h index c44df50a05a..b572e432d2f 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -103,6 +103,7 @@ int arch_update_cpu_topology(void); | 1*SD_SHARE_PKG_RESOURCES \ | 0*SD_SERIALIZE \ | 0*SD_PREFER_SIBLING \ + | arch_sd_sibling_asym_packing() \ , \ .last_balance = jiffies, \ .balance_interval = 1, \ diff --git a/include/linux/vgaarb.h b/include/linux/vgaarb.h index c9a97597699..814f294d4cd 100644 --- a/include/linux/vgaarb.h +++ b/include/linux/vgaarb.h @@ -29,6 +29,7 @@ */ #ifndef LINUX_VGA_H +#define LINUX_VGA_H #include <asm/vga.h> diff --git a/include/math-emu/op-common.h b/include/math-emu/op-common.h index fd882261225..9696a5e2c43 100644 --- a/include/math-emu/op-common.h +++ b/include/math-emu/op-common.h @@ -799,7 +799,7 @@ do { \ X##_e -= (_FP_W_TYPE_SIZE - rsize); \ X##_e = rsize - X##_e - 1; \ \ - if (_FP_FRACBITS_##fs < rsize && _FP_WFRACBITS_##fs < X##_e) \ + if (_FP_FRACBITS_##fs < rsize && _FP_WFRACBITS_##fs <= X##_e) \ __FP_FRAC_SRS_1(ur_, (X##_e - _FP_WFRACBITS_##fs + 1), rsize);\ _FP_FRAC_DISASSEMBLE_##wc(X,
ur_, rsize); \ if ((_FP_WFRACBITS_##fs - X##_e - 1) > 0) \ diff --git a/include/net/sock.h b/include/net/sock.h index 731150d5279..0a691ea7654 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1224,12 +1224,7 @@ static inline void sk_tx_queue_clear(struct sock *sk) static inline int sk_tx_queue_get(const struct sock *sk) { - return sk->sk_tx_queue_mapping; -} - -static inline bool sk_tx_queue_recorded(const struct sock *sk) -{ - return (sk && sk->sk_tx_queue_mapping >= 0); + return sk ? sk->sk_tx_queue_mapping : -1; } static inline void sk_set_socket(struct sock *sk, struct socket *sock) diff --git a/ipc/sem.c b/ipc/sem.c index 506c8491a8d..40a8f462a82 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1256,6 +1256,33 @@ out: return un; } + +/** + * get_queue_result - Retrieve the result code from sem_queue + * @q: Pointer to queue structure + * + * Retrieve the return code from the pending queue. If IN_WAKEUP is found in + * q->status, then we must loop until the value is replaced with the final + * value: This may happen if a task is woken up by an unrelated event (e.g. + * signal) and in parallel the task is woken up by another task because it got + * the requested semaphores. + * + * The function can be called with or without holding the semaphore spinlock. + */ +static int get_queue_result(struct sem_queue *q) +{ + int error; + + error = q->status; + while (unlikely(error == IN_WAKEUP)) { + cpu_relax(); + error = q->status; + } + + return error; +} + + SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, unsigned, nsops, const struct timespec __user *, timeout) { @@ -1409,15 +1436,18 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, else schedule(); - error = queue.status; - while(unlikely(error == IN_WAKEUP)) { - cpu_relax(); - error = queue.status; - } + error = get_queue_result(&queue); if (error != -EINTR) { /* fast path: update_queue already obtained all requested - * resources */ + * resources. + * Perform a smp_mb(): User space could assume that semop() + * is a memory barrier: Without the mb(), the cpu could + * speculatively read in user space stale data that was + * overwritten by the previous owner of the semaphore. + */ + smp_mb(); + goto out_free; } @@ -1427,10 +1457,12 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, goto out_free; } + error = get_queue_result(&queue); + /* * If queue.status != -EINTR we are woken up by another process */ - error = queue.status; + if (error != -EINTR) { goto out_unlock_free; } diff --git a/kernel/cpu.c b/kernel/cpu.c index 97d1b426a4a..f6e726f1849 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - set_cpu_active(cpu, false); err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); if (err) { - set_cpu_active(cpu, true); - nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", @@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { - set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); @@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) goto out_notify; BUG_ON(!cpu_online(cpu)); - set_cpu_active(cpu, true); - /* Now call notifier in preparation. 
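The hunks above drop the direct set_cpu_active() calls from the hotplug core; activation now happens in notifiers ordered by the CPU_PRI_* enum defined earlier. Since notifier chains run highest priority first, sched flips cpu_active before cpuset rebuilds domains, and perf (20) runs before migration (10). A sketch of a client registering at an explicit priority (my_cpu_callback and its priority are hypothetical):

/*
 * Sketch: run after the migration notifier (10) but before
 * default-priority (0) notifiers on CPU_ONLINE.
 */
static int __cpuinit my_cpu_callback(struct notifier_block *nfb,
                                     unsigned long action, void *hcpu)
{
        switch (action & ~CPU_TASKS_FROZEN) {
        case CPU_ONLINE:
                /* cpu_active is already set for (long)hcpu here */
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}

static int __init my_cpu_callback_init(void)
{
        cpu_notifier(my_cpu_callback, 5);
        return 0;
}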
*/ cpu_notify(CPU_ONLINE | mod, hcpu); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02b9611eadd..7146793b5c1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root) * but making no active use of cpusets. * * This routine ensures that top_cpuset.cpus_allowed tracks - * cpu_online_map on each CPU hotplug (cpuhp) event. + * cpu_active_mask on each CPU hotplug (cpuhp) event. * * Called within get_online_cpus(). Needs to call cgroup_lock() * before calling generate_sched_domains(). */ -static int cpuset_track_online_cpus(struct notifier_block *unused_nb, - unsigned long phase, void *unused_cpu) +void cpuset_update_active_cpus(void) { struct sched_domain_attr *attr; cpumask_var_t *doms; int ndoms; - switch (phase) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - break; - - default: - return NOTIFY_DONE; - } - cgroup_lock(); mutex_lock(&callback_mutex); cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); @@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - - return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG @@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void) cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_track_online_cpus, 0); hotplug_memory_notifier(cpuset_track_online_nodes, 10); cpuset_wq = create_singlethread_workqueue("cpuset"); diff --git a/kernel/early_res.c b/kernel/early_res.c index 31aa9332ef3..7bfae887f21 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include <linux/bootmem.h> #include <linux/mm.h> #include <linux/early_res.h> +#include <linux/slab.h> +#include <linux/kmemleak.h> /* * Early reserved memory areas. 
@@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + if (start == end) return; diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14ba04..a82a65cef74 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; - new_flags &= ~PF_SUPERPRIV; + new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; new_flags |= PF_STARTING; p->flags = new_flags; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 5c69e996bd0..e934339fbbe 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, static int hrtimer_get_target(int this_cpu, int pinned) { #ifdef CONFIG_NO_HZ - if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - return preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); #endif return this_cpu; } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 54286798c37..f2852a51023 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], static inline u64 lockstat_clock(void) { - return cpu_clock(smp_processor_id()); + return local_clock(); } static int lock_point(unsigned long points[], unsigned long ip) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ff86c558af4..7e32b51ff04 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx) static inline u64 perf_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } /* diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9829646d399..f66bdd33a6c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { - struct sighand_struct *sighand; - struct signal_struct *sig; + struct signal_struct *sig = tsk->signal; struct task_struct *t; - *times = INIT_CPUTIME; + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; rcu_read_lock(); - sighand = rcu_dereference(tsk->sighand); - if (!sighand) + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) goto out; - sig = tsk->signal; - t = tsk; do { times->utime = cputime_add(times->utime, t->utime); times->stime = cputime_add(times->stime, t->stime); times->sum_exec_runtime += t->se.sum_exec_runtime; - - t = next_thread(t); - } while (t != tsk); - - times->utime = cputime_add(times->utime, sig->utime); - times->stime = cputime_add(times->stime, sig->stime); - times->sum_exec_runtime += sig->sum_sched_runtime; + } while_each_thread(tsk, t); out: rcu_read_unlock(); } @@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk) { struct signal_struct *sig; - /* tsk == 
current, ensure it is safe to use ->signal/sighand */ - if (unlikely(tsk->exit_state)) - return 0; - if (!task_cputime_zero(&tsk->cputime_expires)) { struct task_cputime task_sample = { .utime = tsk->utime, @@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - thread_group_cputimer(tsk, &group_sample); + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; } @@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) { LIST_HEAD(firing); struct k_itimer *timer, *next; + unsigned long flags; BUG_ON(!irqs_disabled()); @@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk) if (!fastpath_timer_check(tsk)) return; - spin_lock(&tsk->sighand->siglock); + if (!lock_task_sighand(tsk, &flags)) + return; /* * Here we take off tsk->signal->cpu_timers[N] and * tsk->cpu_timers[N] all the timers that are firing, and @@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) * that gets the timer lock before we do will give it up and * spin until we've taken care of that timer below. */ - spin_unlock(&tsk->sighand->siglock); + unlock_task_sighand(tsk, &flags); /* * Now that all the timers on our list have the firing flag, diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 6535ac8bc6a..2e2726d790b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -239,8 +239,7 @@ static unsigned long rcu_random(struct rcu_random_state *rrsp) { if (--rrsp->rrs_count < 0) { - rrsp->rrs_state += - (unsigned long)cpu_clock(raw_smp_processor_id()); + rrsp->rrs_state += (unsigned long)local_clock(); rrsp->rrs_count = RCU_RANDOM_REFRESH; } rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; diff --git a/kernel/sched.c b/kernel/sched.c index 63b4a14682f..f6c9bb6ac70 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -77,6 +77,7 @@ #include <asm/irq_regs.h> #include "sched_cpupri.h" +#include "workqueue_sched.h" #define CREATE_TRACE_POINTS #include <trace/events/sched.h> @@ -456,9 +457,10 @@ struct rq { unsigned long nr_running; #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned long last_load_update_tick; #ifdef CONFIG_NO_HZ u64 nohz_stamp; - unsigned char in_nohz_recently; + unsigned char nohz_balance_kick; #endif unsigned int skip_clock_update; @@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu) #ifdef CONFIG_NO_HZ /* + * In the semi-idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do a similar optimization for a completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be up to date wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) + if (!idle_cpu(i)) + return i; + } + return cpu; +} +/* * When add_timer_on() enqueues a timer into the timer wheel of an * idle CPU then this timer might expire before the next timer event * which is scheduled to wake up that CPU.
In case of a completely @@ -1642,7 +1665,7 @@ static void update_shares(struct sched_domain *sd) if (root_task_group_empty()) return; - now = cpu_clock(raw_smp_processor_id()); + now = local_clock(); elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { @@ -1795,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) static void calc_load_account_idle(struct rq *this_rq); static void update_sysctl(void); static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { @@ -2257,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample) } #endif -/*** +static inline void ttwu_activate(struct task_struct *p, struct rq *rq, + bool is_sync, bool is_migrate, bool is_local, + unsigned long en_flags) +{ + schedstat_inc(p, se.statistics.nr_wakeups); + if (is_sync) + schedstat_inc(p, se.statistics.nr_wakeups_sync); + if (is_migrate) + schedstat_inc(p, se.statistics.nr_wakeups_migrate); + if (is_local) + schedstat_inc(p, se.statistics.nr_wakeups_local); + else + schedstat_inc(p, se.statistics.nr_wakeups_remote); + + activate_task(rq, p, en_flags); +} + +static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, + int wake_flags, bool success) +{ + trace_sched_wakeup(p, success); + check_preempt_curr(rq, p, wake_flags); + + p->state = TASK_RUNNING; +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); + + if (unlikely(rq->idle_stamp)) { + u64 delta = rq->clock - rq->idle_stamp; + u64 max = 2*sysctl_sched_migration_cost; + + if (delta > max) + rq->avg_idle = max; + else + update_avg(&rq->avg_idle, delta); + rq->idle_stamp = 0; + } +#endif + /* if a worker is waking up, notify workqueue */ + if ((p->flags & PF_WQ_WORKER) && success) + wq_worker_waking_up(p, cpu_of(rq)); +} + +/** * try_to_wake_up - wake up a thread - * @p: the to-be-woken-up thread + * @p: the thread to be awakened * @state: the mask of task states that can be woken - * @sync: do a synchronous wakeup? + * @wake_flags: wake modifier flags (WF_*) * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual @@ -2269,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample) * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * - * returns failure only if the task is already active. + * Returns %true if @p was woken up, %false if it was already running + * or @state didn't match @p's state. 
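The wq_worker_waking_up() call in ttwu_post_activation() above is one half of a new scheduler/workqueue contract keyed off PF_WQ_WORKER; the other half, visible in the schedule() hunk further down, is wq_worker_sleeping(), which may hand back a task for the scheduler to wake so the per-cpu worker pool stays busy. The prototypes, as implied by the call sites in this patch (the real declarations live in kernel/workqueue_sched.h):

/* called when a PF_WQ_WORKER task is woken up */
void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);

/*
 * called when a PF_WQ_WORKER task goes to sleep; may return another
 * worker for the scheduler to wake via try_to_wake_up_local()
 */
struct task_struct *wq_worker_sleeping(struct task_struct *task,
                                       unsigned int cpu);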
*/ static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) @@ -2349,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, out_activate: #endif /* CONFIG_SMP */ - schedstat_inc(p, se.statistics.nr_wakeups); - if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (orig_cpu != cpu) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (cpu == this_cpu) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); - activate_task(rq, p, en_flags); + ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, + cpu == this_cpu, en_flags); success = 1; - out_running: - trace_sched_wakeup(p, success); - check_preempt_curr(rq, p, wake_flags); - - p->state = TASK_RUNNING; -#ifdef CONFIG_SMP - if (p->sched_class->task_woken) - p->sched_class->task_woken(rq, p); - - if (unlikely(rq->idle_stamp)) { - u64 delta = rq->clock - rq->idle_stamp; - u64 max = 2*sysctl_sched_migration_cost; - - if (delta > max) - rq->avg_idle = max; - else - update_avg(&rq->avg_idle, delta); - rq->idle_stamp = 0; - } -#endif + ttwu_post_activation(p, rq, wake_flags, success); out: task_rq_unlock(rq, &flags); put_cpu(); @@ -2389,6 +2431,37 @@ out: } /** + * try_to_wake_up_local - try to wake up a local task with rq lock held + * @p: the thread to be awakened + * + * Put @p on the run-queue if it's not already there. The caller must + * ensure that this_rq() is locked, @p is bound to this_rq() and not + * the current task. this_rq() stays locked over invocation. + */ +static void try_to_wake_up_local(struct task_struct *p) +{ + struct rq *rq = task_rq(p); + bool success = false; + + BUG_ON(rq != this_rq()); + BUG_ON(p == current); + lockdep_assert_held(&rq->lock); + + if (!(p->state & TASK_NORMAL)) + return; + + if (!p->se.on_rq) { + if (likely(!task_running(rq, p))) { + schedstat_inc(rq, ttwu_count); + schedstat_inc(rq, ttwu_local); + } + ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); + success = true; + } + ttwu_post_activation(p, rq, 0, success); +} + +/** * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @@ -3002,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq) } /* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on the nth tick when the cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding the 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With these power-of-2 load factors, we can degrade the load n times + * by looking at the 1 bits in n and doing as many mult/shifts instead of + * the n mult/shifts needed by the exact degradation.
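To see the table in action, here is a quick user-space check of the idx == 2 row (standalone sketch; the row is zero-padded to DEGRADE_SHIFT + 1 entries, matching the kernel table that follows):

#include <stdio.h>

#define DEGRADE_SHIFT 7
static const unsigned char factor2[DEGRADE_SHIFT + 1] =
        {96, 72, 40, 12, 1, 0, 0, 0};   /* idx == 2 row, zero padded */

static unsigned long decay2(unsigned long load, unsigned long missed)
{
        int j = 0;

        /* binary exponentiation: one multiply per set bit in missed */
        while (missed) {
                if (missed % 2)
                        load = (load * factor2[j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* 8 missed ticks = bit 3 set: a single multiply by 12/128 */
        printf("%lu\n", decay2(1024, 8));       /* prints 96 */
        return 0;                               /* exact: 1024*(3/4)^8 ~ 102 */
}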
+ */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. */ static void update_cpu_load(struct rq *this_rq) { unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; int i, scale; this_rq->nr_load_updates++; + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + /* Update our load: */ - for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { unsigned long old_load, new_load; /* scale is effectively 1 << i now, and >> i divides by scale */ old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -3026,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq) * example. */ if (new_load > old_load) - new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); calc_load_account_active(this_rq); } @@ -3416,7 +3574,7 @@ void scheduler_tick(void) raw_spin_lock(&rq->lock); update_rq_clock(rq); - update_cpu_load(rq); + update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); @@ -3588,7 +3746,6 @@ need_resched: rq = cpu_rq(cpu); rcu_note_context_switch(cpu); prev = rq->curr; - switch_count = &prev->nivcsw; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3601,11 +3758,26 @@ need_resched_nonpreemptible: raw_spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely(signal_pending_state(prev->state, prev))) + if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; - else + } else { + /* + * If a worker is going to sleep, notify and + * ask workqueue whether it wants to wake up a + * task to maintain concurrency. 
If so, wake + * up the task. + */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } deactivate_task(rq, prev, DEQUEUE_SLEEP); + } switch_count = &prev->nvcsw; } @@ -3627,8 +3799,10 @@ need_resched_nonpreemptible: context_switch(rq, prev, next); /* unlocks the rq */ /* - * the context switch might have flipped the stack from under - * us, hence refresh the local variables. + * The context switch has flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -3637,11 +3811,8 @@ need_resched_nonpreemptible: post_schedule(rq); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - prev = rq->curr; - switch_count = &prev->nivcsw; + if (unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; - } preempt_enable_no_resched(); if (need_resched()) @@ -4431,12 +4602,8 @@ recheck: */ if (user && !capable(CAP_SYS_NICE)) { if (rt_policy(policy)) { - unsigned long rlim_rtprio; - - if (!lock_task_sighand(p, &flags)) - return -ESRCH; - rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); - unlock_task_sighand(p, &flags); + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); /* can't set/change the rt policy */ if (policy != p->policy && !rlim_rtprio) @@ -5806,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, - .priority = 10 + .priority = CPU_PRI_MIGRATION, }; +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + static int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; - /* Start one for the boot CPU: */ + /* Initialize migration for the boot CPU */ err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); BUG_ON(err == NOTIFY_BAD); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + return 0; } early_initcall(migration_init); @@ -6054,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) free_rootdomain(old_rd); } -static int init_rootdomain(struct root_domain *rd, bool bootmem) +static int init_rootdomain(struct root_domain *rd) { - gfp_t gfp = GFP_KERNEL; - memset(rd, 0, sizeof(*rd)); - if (bootmem) - gfp = GFP_NOWAIT; - - if (!alloc_cpumask_var(&rd->span, gfp)) + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, gfp)) + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->rto_mask, gfp)) + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_online; - if
(cpupri_init(&rd->cpupri, bootmem) != 0) + if (cpupri_init(&rd->cpupri) != 0) goto free_rto_mask; return 0; @@ -6086,7 +6277,7 @@ out: static void init_defrootdomain(void) { - init_rootdomain(&def_root_domain, true); + init_rootdomain(&def_root_domain); atomic_set(&def_root_domain.refcount, 1); } @@ -6099,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void) if (!rd) return NULL; - if (init_rootdomain(rd, false) != 0) { + if (init_rootdomain(rd) != 0) { kfree(rd); return NULL; } @@ -7278,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -#ifndef CONFIG_CPUSETS /* - * Add online and remove offline CPUs from the scheduler domains. - * When cpusets are enabled they take over this function. + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). */ -static int update_sched_domains(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - partition_sched_domains(1, NULL, NULL); + cpuset_update_active_cpus(); return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; default: return NOTIFY_DONE; } } -#endif static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -7346,10 +7543,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); put_online_cpus(); -#ifndef CONFIG_CPUSETS - /* XXX: Theoretical race here - CPU may be hotplugged now */ - hotcpu_notifier(update_sched_domains, 0); -#endif + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); /* RT runtime code needs to handle some hotplug events */ hotcpu_notifier(update_runtime, 0); @@ -7594,6 +7789,9 @@ void __init sched_init(void) for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; @@ -7607,6 +7805,10 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq_attach_root(rq, &def_root_domain); +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif #endif init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); @@ -7651,8 +7853,11 @@ void __init sched_init(void) zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ - zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); - alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); #endif /* May be allocated at isolcpus cmdline parse time */ if (cpu_isolated_map == NULL) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 906a0f718cb..52f1a149bfb 100644 --- 
a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -10,19 +10,55 @@ * Ingo Molnar <mingo@redhat.com> * Guillaume Chazarain <guichaz@gmail.com> * - * Create a semi stable clock from a mixture of other events, including: - * - gtod + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotonic) * - sched_clock() * - explicit idle events * - * We use gtod as base and the unstable clock deltas. The deltas are filtered, - * making it monotonic and keeping it within an expected window. + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and keep it within an + * expected window. * * Furthermore, explicit sleep and wakeup hooks allow us to account for time * that is otherwise invisible (TSC gets stopped). * - * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat - * consistent between cpus (never more than 2 jiffies difference). + * + * Notes: + * + * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). */ #include <linux/spinlock.h> #include <linux/hardirq.h> @@ -170,6 +206,11 @@ again: return val; } +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). + */ u64 sched_clock_cpu(int cpu) { struct sched_clock_data *scd; @@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); -unsigned long long cpu_clock(int cpu) +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !!
# + * #################################################################### + */ +u64 cpu_clock(int cpu) { - unsigned long long clock; + u64 clock; unsigned long flags; local_irq_save(flags); @@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu) return clock; } +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestamps taken on the + * same CPU. + * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ void sched_clock_init(void) @@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } - -unsigned long long cpu_clock(int cpu) +u64 cpu_clock(int cpu) { return sched_clock_cpu(cpu); } +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index e6871cb3fc8..2722dc1b413 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) * * Returns: -ENOMEM if memory fails. */ -int cpupri_init(struct cpupri *cp, bool bootmem) +int cpupri_init(struct cpupri *cp) { - gfp_t gfp = GFP_KERNEL; int i; - if (bootmem) - gfp = GFP_NOWAIT; - memset(cp, 0, sizeof(*cp)); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { @@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem) raw_spin_lock_init(&vec->lock); vec->count = 0; - if (!zalloc_cpumask_var(&vec->mask, gfp)) + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) goto cleanup; } diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h index 7cb5bb6b95b..9fc7d386fea 100644 --- a/kernel/sched_cpupri.h +++ b/kernel/sched_cpupri.h @@ -27,7 +27,7 @@ struct cpupri { int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); void cpupri_set(struct cpupri *cp, int cpu, int pri); -int cpupri_init(struct cpupri *cp, bool bootmem); +int cpupri_init(struct cpupri *cp); void cpupri_cleanup(struct cpupri *cp); #else #define cpupri_set(cp, cpu, pri) do { } while (0) diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 35565395d00..2e1b0d17dd9 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v) PN(sysctl_sched_latency); PN(sysctl_sched_min_granularity); PN(sysctl_sched_wakeup_granularity); - PN(sysctl_sched_child_runs_first); + P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index a878b5332da..806d1b227a2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - if (sched_feat(ARCH_POWER)) - power *= arch_scale_freq_power(sd, cpu); - else - power *= default_scale_freq_power(sd, cpu); - - power >>= SCHED_LOAD_SHIFT; - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { if (sched_feat(ARCH_POWER)) power *= arch_scale_smt_power(sd, cpu); @@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) power >>= SCHED_LOAD_SHIFT; } + sdg->cpu_power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *=
arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_LOAD_SHIFT; + power *= scale_rt_power(cpu); power >>= SCHED_LOAD_SHIFT; @@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu) sdg->cpu_power = power; } +/* + * Try and fix up capacity for tiny siblings; this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). + */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_LOAD_SCALE + */ + if (sd->level != SD_LV_SIBLING) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->cpu_power * 32 > group->cpu_power_orig * 29) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @sd: The sched_domain whose statistics are to be updated. @@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpus * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu) { - *balance = 0; - return; + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); } - update_group_power(sd, this_cpu); - /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; @@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. + */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + if (sgs->avg_load <= sds->max_load) + return false; + + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + if (!sds->busiest) + return true; + + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; } /** @@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * @sd: sched_domain whose statistics are to be updated. * @this_cpu: Cpu for which load balance is currently performed. * @idle: Idle status of this_cpu - * @sd_idle: Idle status of the sched_domain containing group. + * @sd_idle: Idle status of the sched_domain containing sg. * @cpus: Set of cpus considered for load balancing. * @balance: Should we balance. * @sds: variable to hold the statistics for this sched_domain.
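Worth pinning down the ~90% test in fix_small_capacity() above: 29/32 = 0.90625, so with cpu_power_orig at SCHED_LOAD_SCALE (1024) a sibling group keeps capacity 1 only while its scaled cpu_power stays above 928. A quick user-space check:

#include <stdio.h>

int main(void)
{
        unsigned int orig = 1024;       /* SCHED_LOAD_SCALE */
        unsigned int power;

        /* threshold sits between 928 and 929 */
        for (power = 920; power <= 936; power += 8)
                printf("power=%u -> capacity %s\n", power,
                       power * 32 > orig * 29 ? "1" : "0");
        return 0;
}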
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sd_lb_stats *sds) { struct sched_domain *child = sd->child; - struct sched_group *group = sd->groups; + struct sched_group *sg = sd->groups; struct sg_lb_stats sgs; int load_idx, prefer_sibling = 0; @@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, do { int local_group; - local_group = cpumask_test_cpu(this_cpu, - sched_group_cpus(group)); + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && !(*balance)) return; sds->total_load += sgs.group_load; - sds->total_pwr += group->cpu_power; + sds->total_pwr += sg->cpu_power; /* * In case the child domain prefers tasks go to siblings - * first, lower the group capacity to one so that we'll try + * first, lower the sg capacity to one so that we'll try * and move all the excess tasks away. */ if (prefer_sibling) @@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, if (local_group) { sds->this_load = sgs.avg_load; - sds->this = group; + sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds->max_load && - (sgs.sum_nr_running > sgs.group_capacity || - sgs.group_imb)) { + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; - sds->busiest = group; + sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; sds->group_imb = sgs.group_imb; } - update_sd_power_savings_stats(group, sds, local_group, &sgs); - group = group->next; - } while (group != sd->groups); + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched domain. + * + * This is primarily intended to be used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share fewer core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming a lower CPU number will be equivalent to a lower SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. + * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalance due to packing.
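Because arch_sd_sibling_asym_packing() is declared __weak above, the generic scheduler compiles in the do-nothing default while an architecture can ship a strong definition that returns SD_ASYM_PACKING when its SMT threads are asymmetric. A minimal userspace sketch of that weak/strong pattern; the flag value and the feature test named in the comment are placeholders, not the kernel's:

#include <stdio.h>

#define SD_ASYM_PACKING 0x0800	/* illustrative value only */

/* Generic default, overridable exactly like the kernel's __weak hook. */
int __attribute__((weak)) arch_sd_sibling_asym_packing(void)
{
	return 0;
}

/*
 * An architecture would supply a strong definition in its own file,
 * e.g. returning SD_ASYM_PACKING when a hypothetical
 * smt_threads_are_asymmetric() test says low-numbered threads run faster.
 */

int main(void)
{
	printf("sibling flags: %#x\n", arch_sd_sibling_asym_packing());
	return 0;
}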
+ */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, + SCHED_LOAD_SCALE); + return 1; } /** @@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!(*balance)) goto ret; + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; @@ -2726,8 +2850,9 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, - unsigned long imbalance, const struct cpumask *cpus) +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) { struct rq *busiest = NULL, *rq; unsigned long max_load = 0; @@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; + if (!capacity) + capacity = fix_small_capacity(sd, group); + if (!cpumask_test_cpu(i, cpus)) continue; @@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, /* Working cpumask for load_balance and load_balance_newidle. */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); -static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) +static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle, + int busiest_cpu, int this_cpu) { if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + /* * The only task running in a non-idle cpu can be moved to this * cpu in an attempt to completely freeup the other CPU @@ -2854,7 +2992,7 @@ redo: goto out_balanced; } - busiest = find_busiest_queue(group, idle, imbalance, cpus); + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2898,7 +3036,8 @@ redo: schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; - if (need_active_balance(sd, sd_idle, idle)) { + if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), + this_cpu)) { raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -3093,13 +3232,40 @@ out_unlock: } #ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. 
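The imbalance reported by check_asym_packing() above is just the busiest group's average load rescaled by its cpu_power, rounded to the nearest unit. A worked example with a kernel-style DIV_ROUND_CLOSEST for unsigned operands and invented figures:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
/* Kernel-style rounding division, valid for unsigned operands. */
#define DIV_ROUND_CLOSEST(x, d) (((x) + ((d) / 2)) / (d))

int main(void)
{
	/* Invented: busiest group at avg_load 1536, cpu_power 2048. */
	unsigned long max_load = 1536, cpu_power = 2048;
	unsigned long imbalance =
	    DIV_ROUND_CLOSEST(max_load * cpu_power, SCHED_LOAD_SCALE);

	printf("imbalance = %lu\n", imbalance);	/* 3072 */
	return 0;
}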
+ * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notices that there may be an idle rebalancing + * needed, it will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs. + */ static struct { atomic_t load_balancer; - cpumask_var_t cpu_mask; - cpumask_var_t ilb_grp_nohz_mask; -} nohz ____cacheline_aligned = { - .load_balancer = ATOMIC_INIT(-1), -}; + atomic_t first_pick_cpu; + atomic_t second_pick_cpu; + cpumask_var_t idle_cpus_mask; + cpumask_var_t grp_idle_mask; + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; int get_nohz_load_balancer(void) { @@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) */ static inline int is_semi_idle_group(struct sched_group *ilb_group) { - cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, sched_group_cpus(ilb_group)); /* * A sched_group is semi-idle when it has atleast one busy cpu * and atleast one idle cpu. */ - if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + if (cpumask_empty(nohz.grp_idle_mask)) return 0; - if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) return 0; return 1; @@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu) * Optimize for the case when we have no idle CPUs or only one * idle CPU. Don't walk the sched_domain hierarchy in such cases */ - if (cpumask_weight(nohz.cpu_mask) < 2) + if (cpumask_weight(nohz.idle_cpus_mask) < 2) goto out_done; for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { @@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu) do { if (is_semi_idle_group(ilb_group)) - return cpumask_first(nohz.ilb_grp_nohz_mask); + return cpumask_first(nohz.grp_idle_mask); ilb_group = ilb_group->next; @@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu) } out_done: - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return cpumask_first(nohz.cpu_mask); + return nr_cpu_ids; } #endif /* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle - * load balancing on behalf of all those cpus.
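A structural point worth noting in the hunks above: "no balancer" is now encoded as nr_cpu_ids instead of -1, so ownership changes hands through plain compare-and-swap on a value that is either a valid cpu number or the sentinel. A C11 sketch of that claim/resign protocol; NR_CPUS stands in for nr_cpu_ids and the cpu numbers are arbitrary:

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 8	/* stands in for nr_cpu_ids: the "no owner" sentinel */

static atomic_int load_balancer = NR_CPUS;

/* Claim ownership only if nobody holds it, as the cmpxchg in
 * select_nohz_load_balancer() does. */
static int try_claim(int cpu)
{
	int expected = NR_CPUS;

	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Resign only if we are still the owner. */
static void resign(int cpu)
{
	int expected = cpu;

	atomic_compare_exchange_strong(&load_balancer, &expected, NR_CPUS);
}

int main(void)
{
	printf("cpu2 claims: %d\n", try_claim(2));	/* 1: success */
	printf("cpu5 claims: %d\n", try_claim(5));	/* 0: cpu2 owns it */
	resign(2);
	printf("cpu5 claims: %d\n", try_claim(5));	/* 1: slot was free */
	return 0;
}

Using the sentinel also lets every reader test "ilb_cpu >= nr_cpu_ids" instead of special-casing a negative value.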
* - * While stopping the tick, this cpu will become the ilb owner if there - * is no other owner. And will be the owner till that cpu becomes busy - * or if all cpus in the system stop their ticks at which point - * there is no need for ilb owner. + * When the ilb owner becomes busy, we will not have a new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. * - * When the ilb owner becomes busy, it nominates another owner, during the - * next busy scheduler_tick() + * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this + * ilb owner CPU in the future (when there is a need for idle load balancing on + * behalf of all idle CPUs). */ -int select_nohz_load_balancer(int stop_tick) +void select_nohz_load_balancer(int stop_tick) { int cpu = smp_processor_id(); if (stop_tick) { - cpu_rq(cpu)->in_nohz_recently = 1; - if (!cpu_active(cpu)) { if (atomic_read(&nohz.load_balancer) != cpu) - return 0; + return; /* * If we are going offline and still the leader, * give up! */ - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); - return 0; + return; } - cpumask_set_cpu(cpu, nohz.cpu_mask); + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); - /* time for ilb owner also to sleep */ - if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { - if (atomic_read(&nohz.load_balancer) == cpu) - atomic_set(&nohz.load_balancer, -1); - return 0; - } + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); - if (atomic_read(&nohz.load_balancer) == -1) { - /* make me the ilb owner */ - if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) - return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { int new_ilb; - if (!(sched_smt_power_savings || - sched_mc_power_savings)) - return 1; + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + /* * Check to see if there is a more power-efficient * ilb. */ new_ilb = find_new_ilb(cpu); if (new_ilb < nr_cpu_ids && new_ilb != cpu) { - atomic_set(&nohz.load_balancer, -1); + atomic_set(&nohz.load_balancer, nr_cpu_ids); resched_cpu(new_ilb); - return 0; + return; } - return 1; + return; } } else { - if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) - return 0; + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; - cpumask_clear_cpu(cpu, nohz.cpu_mask); + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); if (atomic_read(&nohz.load_balancer) == cpu) - if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) BUG(); } - return 0; + return; } #endif @@ -3385,11 +3569,102 @@ out: rq->next_balance = next_balance; } +#ifdef CONFIG_NO_HZ /* - * run_rebalance_domains is triggered when needed from the scheduler tick. - * In CONFIG_NO_HZ case, the idle load balance owner will do the + * In CONFIG_NO_HZ case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped.
*/ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is one of the busy CPUs. It will kick + * the idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there is more than one busy CPU, the idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick the idle load balancer as soon as it has any load. + */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + + if (time_before(now, nohz.next_balance)) + return 0; + + if (!rq->nr_running) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balance_kick set). + */ static void run_rebalance_domains(struct softirq_action *h) { int this_cpu = smp_processor_id(); @@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h) rebalance_domains(this_cpu, idle); -#ifdef CONFIG_NO_HZ /* - * If this cpu is the owner for idle load balancing, then do the + * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are * stopped. */ - if (this_rq->idle_at_tick && - atomic_read(&nohz.load_balancer) == this_cpu) { - struct rq *rq; - int balance_cpu; - - for_each_cpu(balance_cpu, nohz.cpu_mask) { - if (balance_cpu == this_cpu) - continue; - - /* - * If this cpu gets work to do, stop the load balancing - * work being done for other cpus. Next load - * balancing owner will pick it up.
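The first_pick/second_pick heuristic above is easy to exercise outside the kernel: both slots are cmpxchg-claimed, the first claimant kicks only when it is overloaded, and a distinct second claimant kicks on any load. A condensed C11 simulation; it skips the jiffies throttle and the slot-clearing details, and the task counts are invented:

#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 8	/* stands in for nr_cpu_ids */

static atomic_int first_pick = NR_CPUS;
static atomic_int second_pick = NR_CPUS;

/* Condensed analogue of nohz_kick_needed(): 1 means this busy cpu
 * should kick the idle load balancer. */
static int kick_needed(int cpu, int nr_running)
{
	int slot = NR_CPUS;

	if (!nr_running)
		return 0;

	if (atomic_compare_exchange_strong(&first_pick, &slot, cpu) ||
	    slot == cpu)
		return nr_running > 1;	/* sole busy cpu: kick if overloaded */

	slot = NR_CPUS;
	if (atomic_compare_exchange_strong(&second_pick, &slot, cpu) ||
	    slot == cpu)
		return 1;		/* a second busy cpu: any load kicks */

	return 0;
}

int main(void)
{
	printf("%d\n", kick_needed(0, 1));	/* 0: one cpu, one task */
	printf("%d\n", kick_needed(0, 2));	/* 1: same cpu, overloaded */
	printf("%d\n", kick_needed(3, 1));	/* 1: a second busy cpu */
	return 0;
}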
- */ - if (need_resched()) - break; - - rebalance_domains(balance_cpu, CPU_IDLE); - - rq = cpu_rq(balance_cpu); - if (time_after(this_rq->next_balance, rq->next_balance)) - this_rq->next_balance = rq->next_balance; - } - } -#endif + nohz_idle_balance(this_cpu, idle); } static inline int on_null_domain(int cpu) @@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu) /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. - * - * In case of CONFIG_NO_HZ, this is the place where we nominate a new - * idle load balancing owner or decide to stop the periodic load balancing, - * if the whole system is idle. */ static inline void trigger_load_balance(struct rq *rq, int cpu) { -#ifdef CONFIG_NO_HZ - /* - * If we were in the nohz mode recently and busy at the current - * scheduler tick, then check if we need to nominate new idle - * load balancer. - */ - if (rq->in_nohz_recently && !rq->idle_at_tick) { - rq->in_nohz_recently = 0; - - if (atomic_read(&nohz.load_balancer) == cpu) { - cpumask_clear_cpu(cpu, nohz.cpu_mask); - atomic_set(&nohz.load_balancer, -1); - } - - if (atomic_read(&nohz.load_balancer) == -1) { - int ilb = find_new_ilb(cpu); - - if (ilb < nr_cpu_ids) - resched_cpu(ilb); - } - } - - /* - * If this cpu is idle and doing idle load balancing for all the - * cpus with ticks stopped, is it time for that to stop? - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && - cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { - resched_cpu(cpu); - return; - } - - /* - * If this cpu is idle and the idle load balancing is done by - * someone else, then no need raise the SCHED_SOFTIRQ - */ - if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && - cpumask_test_cpu(cpu, nohz.cpu_mask)) - return; -#endif /* Don't need to rebalance while attached to NULL domain */ if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif } static void rq_online_fair(struct rq *rq) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8afb953e31c..d10c80ebb67 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p) { unsigned long soft, hard; - if (!p->signal) - return; - /* max may change after cur was read, this will be fixed next tick */ soft = task_rlimit(p, RLIMIT_RTTIME); hard = task_rlimit_max(p, RLIMIT_RTTIME); diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 32d2bd4061b..25c2f962f6f 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) static inline void account_group_user_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk, static inline void account_group_system_time(struct task_struct *tsk, cputime_t cputime) { - struct thread_group_cputimer *cputimer; - - /* tsk == current, ensure it is safe to use ->signal */ - if (unlikely(tsk->exit_state)) - return; - - cputimer = &tsk->signal->cputimer; + struct 
thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; @@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk, static inline void account_group_exec_runtime(struct task_struct *tsk, unsigned long long ns) { - struct thread_group_cputimer *cputimer; - struct signal_struct *sig; - - sig = tsk->signal; - /* see __exit_signal()->task_rq_unlock_wait() */ - barrier(); - if (unlikely(!sig)) - return; - - cputimer = &sig->cputimer; + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; if (!cputimer->running) return; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f898af60817..021d2f878f1 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle) * the scheduler tick in nohz_restart_sched_tick. */ if (!ts->tick_stopped) { - if (select_nohz_load_balancer(1)) { - /* - * sched tick not stopped! - */ - cpumask_clear_cpu(cpu, nohz_cpu_mask); - goto out; - } + select_nohz_load_balancer(1); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; diff --git a/kernel/timer.c b/kernel/timer.c index ee305c8d4e1..48d6aec0789 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -679,12 +679,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - int preferred_cpu = get_nohz_load_balancer(); - - if (preferred_cpu >= 0) - cpu = preferred_cpu; - } + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); #endif new_base = per_cpu(tvec_bases, cpu); diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 9d589d8dcd1..1723e2b8c58 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -56,7 +56,7 @@ u64 notrace trace_clock_local(void) */ u64 notrace trace_clock(void) { - return cpu_clock(raw_smp_processor_id()); + return local_clock(); } diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 00000000000..af040babb74 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,16 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. 
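With trace_clock() switched to local_clock() in the hunk above, tracers get the cheaper per-CPU clock and inherit its contract: only timestamps taken on the same CPU compare monotonically. A hedged kernel-style sketch of timing a section with it; the function and message are illustrative, not part of the patch:

#include <linux/sched.h>	/* local_clock() */
#include <linux/kernel.h>

static void time_a_section(void)
{
	u64 t0, t1;

	t0 = local_clock();	/* ns; irqs are disabled internally */
	/* ... work that stays on this CPU between the two samples ... */
	t1 = local_clock();

	/* Valid comparison: both samples came from the same CPU's clock. */
	pr_info("section took %llu ns\n", (unsigned long long)(t1 - t0));
}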
+ */ +static inline void wq_worker_waking_up(struct task_struct *task, + unsigned int cpu) +{ +} + +static inline struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + return NULL; +} diff --git a/mm/bootmem.c b/mm/bootmem.c index 58c66cc5056..142c84a5499 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -833,15 +833,24 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + void *ptr; + if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); #ifdef CONFIG_NO_BOOTMEM - return __alloc_memory_core_early(pgdat->node_id, size, align, + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, -1ULL); #else - return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); + ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); #endif + + return ptr; } void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, @@ -977,14 +986,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + void *ptr; + if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); #ifdef CONFIG_NO_BOOTMEM - return __alloc_memory_core_early(pgdat->node_id, size, align, + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); #else - return ___alloc_bootmem_node(pgdat->bdata, size, align, + ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); #endif + return ptr; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 431214b941a..9bd339eb04c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3634,6 +3634,9 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, int i; void *ptr; + if (limit > get_max_mapped()) + limit = get_max_mapped(); + /* need to go over early_node_map to find out good range for node */ for_each_active_range_index_in_nid(i, nid) { u64 addr; @@ -3659,6 +3662,11 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, ptr = phys_to_virt(addr); memset(ptr, 0, size); reserve_early_without_check(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); return ptr; } diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 6c0081441a3..5bffada7cde 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -9,6 +9,7 @@ #include <linux/vmalloc.h> #include <linux/cgroup.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) @@ -126,6 +127,12 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) if (!base) base = vmalloc(table_size); } + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. 
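The bootmem hunks above all follow one shape: try the allocation constrained to the requested node, and only then retry with no node preference instead of failing outright. A userspace sketch of that fallback; the allocator stub and the "full" node number are made up:

#include <stdio.h>
#include <stdlib.h>

#define ANY_NODE (-1)

/* Stub allocator: pretend node 3 has run out of memory. */
static void *alloc_on_node(int node, size_t size)
{
	if (node == 3)
		return NULL;
	return malloc(size);
}

/* Prefer locality, then fall back, as __alloc_bootmem_node() now does. */
static void *alloc_pref_node(int node, size_t size)
{
	void *p = alloc_on_node(node, size);

	return p ? p : alloc_on_node(ANY_NODE, size);
}

int main(void)
{
	void *p = alloc_pref_node(3, 64);

	printf("node 3 was full, fallback gave %p\n", p);
	free(p);
	return 0;
}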
+ */ + kmemleak_not_leak(base); } else { /* * We don't have to allocate page_cgroup again, but diff --git a/mm/vmscan.c b/mm/vmscan.c index 9c7e57cc63a..b94fe1b3da4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -213,8 +213,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + unsigned long max_pass; + max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -242,8 +243,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); + shrink_ret = (*shrinker->shrink)(shrinker, this_scan, + gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) @@ -296,7 +298,7 @@ static int may_write_to_queue(struct backing_dev_info *bdi) static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page(page); + lock_page_nosync(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b10e3cdb08f..800b6b9fbba 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -358,6 +358,11 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 acl->sec_level = sec_level; acl->auth_type = auth_type; hci_acl_connect(acl); + } else { + if (acl->sec_level < sec_level) + acl->sec_level = sec_level; + if (acl->auth_type < auth_type) + acl->auth_type = auth_type; } if (type == ACL_LINK) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6c57fc71c7e..786b5de0bac 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1049,6 +1049,8 @@ static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *s if (conn) { if (!ev->status) conn->link_mode |= HCI_LM_AUTH; + else + conn->sec_level = BT_SECURITY_LOW; clear_bit(HCI_CONN_AUTH_PEND, &conn->pend); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 1b682a5aa06..cf3c4073a8a 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -401,6 +401,11 @@ static inline void l2cap_send_rr_or_rnr(struct l2cap_pinfo *pi, u16 control) l2cap_send_sframe(pi, control); } +static inline int __l2cap_no_conn_pending(struct sock *sk) +{ + return !(l2cap_pi(sk)->conf_state & L2CAP_CONF_CONNECT_PEND); +} + static void l2cap_do_start(struct sock *sk) { struct l2cap_conn *conn = l2cap_pi(sk)->conn; @@ -409,12 +414,13 @@ static void l2cap_do_start(struct sock *sk) if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE)) return; - if (l2cap_check_security(sk)) { + if (l2cap_check_security(sk) && __l2cap_no_conn_pending(sk)) { struct l2cap_conn_req req; req.scid = cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; l2cap_pi(sk)->ident = l2cap_get_ident(conn); + l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND; l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req); @@ -464,12 +470,14 @@ static void l2cap_conn_start(struct l2cap_conn *conn) } if (sk->sk_state == BT_CONNECT) { - if (l2cap_check_security(sk)) { + if (l2cap_check_security(sk) && + __l2cap_no_conn_pending(sk)) { struct l2cap_conn_req req; req.scid = 
cpu_to_le16(l2cap_pi(sk)->scid); req.psm = l2cap_pi(sk)->psm; l2cap_pi(sk)->ident = l2cap_get_ident(conn); + l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND; l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req); @@ -2912,7 +2920,6 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd l2cap_pi(sk)->ident = 0; l2cap_pi(sk)->dcid = dcid; l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; - l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, @@ -4404,6 +4411,7 @@ static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) req.psm = l2cap_pi(sk)->psm; l2cap_pi(sk)->ident = l2cap_get_ident(conn); + l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND; l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index eedf2c94820..753fc4221f3 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -217,14 +217,6 @@ static bool br_devices_support_netpoll(struct net_bridge *br) return count != 0 && ret; } -static void br_poll_controller(struct net_device *br_dev) -{ - struct netpoll *np = br_dev->npinfo->netpoll; - - if (np->real_dev != br_dev) - netpoll_poll_dev(np->real_dev); -} - void br_netpoll_cleanup(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -295,7 +287,6 @@ static const struct net_device_ops br_netdev_ops = { .ndo_do_ioctl = br_dev_ioctl, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_cleanup = br_netpoll_cleanup, - .ndo_poll_controller = br_poll_controller, #endif }; diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index a4e72a89e4f..595da45f908 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -50,14 +50,7 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) kfree_skb(skb); else { skb_push(skb, ETH_HLEN); - -#ifdef CONFIG_NET_POLL_CONTROLLER - if (unlikely(skb->dev->priv_flags & IFF_IN_NETPOLL)) { - netpoll_send_skb(skb->dev->npinfo->netpoll, skb); - skb->dev->priv_flags &= ~IFF_IN_NETPOLL; - } else -#endif - dev_queue_xmit(skb); + dev_queue_xmit(skb); } } @@ -73,23 +66,9 @@ int br_forward_finish(struct sk_buff *skb) static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) { -#ifdef CONFIG_NET_POLL_CONTROLLER - struct net_bridge *br = to->br; - if (unlikely(br->dev->priv_flags & IFF_IN_NETPOLL)) { - struct netpoll *np; - to->dev->npinfo = skb->dev->npinfo; - np = skb->dev->npinfo->netpoll; - np->real_dev = np->dev = to->dev; - to->dev->priv_flags |= IFF_IN_NETPOLL; - } -#endif skb->dev = to->dev; NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, br_forward_finish); -#ifdef CONFIG_NET_POLL_CONTROLLER - if (skb->dev->npinfo) - skb->dev->npinfo->netpoll->dev = br->dev; -#endif } static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) diff --git a/net/core/dev.c b/net/core/dev.c index 723a34710ad..0ea10f849be 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1911,8 +1911,16 @@ static int dev_gso_segment(struct sk_buff *skb) */ static inline void skb_orphan_try(struct sk_buff *skb) { - if (!skb_tx(skb)->flags) + struct sock *sk = skb->sk; + + if (sk && !skb_tx(skb)->flags) { + /* skb_tx_hash() wont be able to get sk. 
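The skb_tx_hash() hunk just below folds the saved skb->rxhash into the fallback hash; after the kernel additionally mixes it with jhash_1word(), the 32-bit result is spread across the device's tx queues with a multiply-and-shift rather than a modulo. A sketch of both steps, minus the jhash mixing; all input values are invented:

#include <stdint.h>
#include <stdio.h>

/* Range reduction as in skb_tx_hash(): (hash * nqueues) >> 32 maps a
 * 32-bit hash onto [0, nqueues) without dividing. */
static uint16_t pick_queue(uint32_t hash, uint16_t nqueues)
{
	return (uint16_t)(((uint64_t)hash * nqueues) >> 32);
}

int main(void)
{
	/* Orphaned-skb fallback: 16-bit protocol XORed with rxhash. */
	uint32_t protocol = 0x0800, rxhash = 0xdeadbeef;
	uint32_t hash = (uint16_t)protocol ^ rxhash;

	printf("queue %u of 16\n", pick_queue(hash, 16));	/* queue 13 */
	return 0;
}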
+ * We copy sk_hash into skb->rxhash + */ + if (!skb->rxhash) + skb->rxhash = sk->sk_hash; skb_orphan(skb); + } } int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, @@ -1998,8 +2006,7 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else - hash = (__force u16) skb->protocol; - + hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); @@ -2022,12 +2029,11 @@ static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) static struct netdev_queue *dev_pick_tx(struct net_device *dev, struct sk_buff *skb) { - u16 queue_index; + int queue_index; struct sock *sk = skb->sk; - if (sk_tx_queue_recorded(sk)) { - queue_index = sk_tx_queue_get(sk); - } else { + queue_index = sk_tx_queue_get(sk); + if (queue_index < 0) { const struct net_device_ops *ops = dev->netdev_ops; if (ops->ndo_select_queue) { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 6ba1c0eece0..a4e0a7482c2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -949,7 +949,10 @@ static void neigh_update_hhs(struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) - = neigh->dev->header_ops->cache_update; + = NULL; + + if (neigh->dev->header_ops) + update = neigh->dev->header_ops->cache_update; if (update) { for (hh = neigh->hh; hh; hh = hh->hh_next) { diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index c51b55400dc..11201784d29 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -1,7 +1,7 @@ menuconfig NET_DSA bool "Distributed Switch Architecture support" default n - depends on EXPERIMENTAL && !S390 + depends on EXPERIMENTAL && NET_ETHERNET && !S390 select PHYLIB ---help--- This allows you to use hardware switch chips that use diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 757f25eb9b4..7f6273506ee 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -442,8 +442,10 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) int err; err = ipmr_fib_lookup(net, &fl, &mrt); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return err; + } read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; @@ -1728,8 +1730,10 @@ int ip_mr_input(struct sk_buff *skb) goto dont_forward; err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return err; + } if (!local) { if (IPCB(skb)->opt.router_alert) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6596b4feedd..65afeaec15b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -608,6 +608,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, ssize_t spliced; int ret; + sock_rps_record_flow(sk); /* * We can't seek on a socket input */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b4ed957f201..7ed9dc1042d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2208,6 +2208,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) int mib_idx; int fwd_rexmitting = 0; + if (!tp->packets_out) + return; + if (!tp->lost_out) tp->retransmit_high = tp->snd_una; diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 2794b600283..d6e9599d070 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -347,11 +347,12 @@ static const struct xfrm_type mip6_destopt_type = static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb) { + struct ipv6hdr *iph = ipv6_hdr(skb); struct rt2_hdr *rt2 = (struct 
rt2_hdr *)skb->data; int err = rt2->rt_hdr.nexthdr; spin_lock(&x->lock); - if (!ipv6_addr_equal(&rt2->addr, (struct in6_addr *)x->coaddr) && + if (!ipv6_addr_equal(&iph->daddr, (struct in6_addr *)x->coaddr) && !ipv6_addr_any((struct in6_addr *)x->coaddr)) err = -ENOENT; spin_unlock(&x->lock); diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 94d72e85a47..b2a3ae6cad7 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -698,6 +698,7 @@ static struct sock *pep_sock_accept(struct sock *sk, int flags, int *errp) newsk = NULL; goto out; } + kfree_skb(oskb); sock_hold(sk); pep_sk(newsk)->listener = sk; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 570949417f3..724553e8ed7 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -205,7 +205,7 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, { struct icmphdr *icmph; - if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph))) + if (!pskb_may_pull(skb, ihl + sizeof(*icmph))) goto drop; icmph = (void *)(skb_network_header(skb) + ihl); @@ -215,6 +215,9 @@ static int tcf_nat(struct sk_buff *skb, struct tc_action *a, (icmph->type != ICMP_PARAMETERPROB)) break; + if (!pskb_may_pull(skb, ihl + sizeof(*icmph) + sizeof(*iph))) + goto drop; + iph = (void *)(icmph + 1); if (egress) addr = iph->daddr; diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 73affb8624f..8dc47f1d000 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -267,7 +267,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) * Run memory cache shrinker. */ static int -rpcauth_cache_shrinker(int nr_to_scan, gfp_t gfp_mask) +rpcauth_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { LIST_HEAD(free); int res; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index af1c173be4a..a7ec5a8a238 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1594,8 +1594,8 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); - if (err < 0) { - if (err != -EAGAIN) + if (err <= 0) { + if (err != 0 && err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } @@ -1678,6 +1678,13 @@ xfrm_bundle_lookup(struct net *net, struct flowi *fl, u16 family, u8 dir, goto make_dummy_bundle; dst_hold(&xdst->u.dst); return oldflo; + } else if (new_xdst == NULL) { + num_xfrms = 0; + if (oldflo == NULL) + goto make_dummy_bundle; + xdst->num_xfrms = 0; + dst_hold(&xdst->u.dst); + return oldflo; } /* Kill the previous bundle */ @@ -1760,6 +1767,10 @@ restart: xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); goto dropdst; + } else if (xdst == NULL) { + num_xfrms = 0; + drop_pols = num_pols; + goto no_transform; } spin_lock_bh(&xfrm_policy_sk_bundle_lock); diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 31ac5538fe7..5da30eb6ad0 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -83,8 +83,8 @@ config SND_SOC_ALL_CODECS config SND_SOC_WM_HUBS tristate - default y if SND_SOC_WM8993=y - default m if SND_SOC_WM8993=m + default y if SND_SOC_WM8993=y || SND_SOC_WM8994=y + default m if SND_SOC_WM8993=m || SND_SOC_WM8994=m config SND_SOC_AC97_CODEC tristate diff --git a/sound/soc/codecs/wm8727.c b/sound/soc/codecs/wm8727.c index 1072621e93f..9d1df262813 100644 --- a/sound/soc/codecs/wm8727.c +++ b/sound/soc/codecs/wm8727.c @@ -127,6 +127,8 @@ static __devinit int wm8727_platform_probe(struct platform_device *pdev) goto err_codec; } + return 
0; + err_codec: snd_soc_unregister_codec(codec); err: diff --git a/sound/soc/codecs/wm8776.c b/sound/soc/codecs/wm8776.c index 7e4a627b4c7..4e212ed62ea 100644 --- a/sound/soc/codecs/wm8776.c +++ b/sound/soc/codecs/wm8776.c @@ -94,7 +94,6 @@ SOC_DAPM_SINGLE("Bypass Switch", WM8776_OUTMUX, 2, 1, 0), static const struct snd_soc_dapm_widget wm8776_dapm_widgets[] = { SND_SOC_DAPM_INPUT("AUX"), -SND_SOC_DAPM_INPUT("AUX"), SND_SOC_DAPM_INPUT("AIN1"), SND_SOC_DAPM_INPUT("AIN2"), diff --git a/sound/soc/codecs/wm8988.c b/sound/soc/codecs/wm8988.c index 0417dae32e6..19ad590ca0b 100644 --- a/sound/soc/codecs/wm8988.c +++ b/sound/soc/codecs/wm8988.c @@ -885,7 +885,6 @@ static int wm8988_register(struct wm8988_priv *wm8988, ret = snd_soc_register_dai(&wm8988_dai); if (ret != 0) { dev_err(codec->dev, "Failed to register DAI: %d\n", ret); - snd_soc_unregister_codec(codec); goto err_codec; } diff --git a/sound/soc/sh/fsi.c b/sound/soc/sh/fsi.c index 3396a0db06b..ec4acac49eb 100644 --- a/sound/soc/sh/fsi.c +++ b/sound/soc/sh/fsi.c @@ -683,20 +683,15 @@ static int fsi_dai_startup(struct snd_pcm_substream *substream, /* clock inversion (CKG2) */ data = 0; - switch (SH_FSI_INVERSION_MASK & flags) { - case SH_FSI_LRM_INV: - data = 1 << 12; - break; - case SH_FSI_BRM_INV: - data = 1 << 8; - break; - case SH_FSI_LRS_INV: - data = 1 << 4; - break; - case SH_FSI_BRS_INV: - data = 1 << 0; - break; - } + if (SH_FSI_LRM_INV & flags) + data |= 1 << 12; + if (SH_FSI_BRM_INV & flags) + data |= 1 << 8; + if (SH_FSI_LRS_INV & flags) + data |= 1 << 4; + if (SH_FSI_BRS_INV & flags) + data |= 1 << 0; + fsi_reg_write(fsi, CKG2, data); /* do fmt, di fmt */ @@ -726,15 +721,15 @@ static int fsi_dai_startup(struct snd_pcm_substream *substream, break; case SH_FSI_FMT_TDM: msg = "TDM"; - data = CR_FMT(CR_TDM) | (fsi->chan - 1); fsi->chan = is_play ? SH_FSI_GET_CH_O(flags) : SH_FSI_GET_CH_I(flags); + data = CR_FMT(CR_TDM) | (fsi->chan - 1); break; case SH_FSI_FMT_TDM_DELAY: msg = "TDM Delay"; - data = CR_FMT(CR_TDM_D) | (fsi->chan - 1); fsi->chan = is_play ? SH_FSI_GET_CH_O(flags) : SH_FSI_GET_CH_I(flags); + data = CR_FMT(CR_TDM_D) | (fsi->chan - 1); break; default: dev_err(dai->dev, "unknown format.\n"); diff --git a/tools/perf/arch/sparc/Makefile b/tools/perf/arch/sparc/Makefile new file mode 100644 index 00000000000..15130b50dfe --- /dev/null +++ b/tools/perf/arch/sparc/Makefile @@ -0,0 +1,4 @@ +ifndef NO_DWARF +PERF_HAVE_DWARF_REGS := 1 +LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o +endif diff --git a/tools/perf/arch/sparc/util/dwarf-regs.c b/tools/perf/arch/sparc/util/dwarf-regs.c new file mode 100644 index 00000000000..0ab88483720 --- /dev/null +++ b/tools/perf/arch/sparc/util/dwarf-regs.c @@ -0,0 +1,43 @@ +/* + * Mapping of DWARF debug register numbers into register names. + * + * Copyright (C) 2010 David S. Miller <davem@davemloft.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
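One more note on the sound/soc/sh/fsi.c hunk above: the old switch over (SH_FSI_INVERSION_MASK & flags) could match at most one inversion bit, silently dropping combinations, while the replacement tests each bit on its own so the CKG2 bits accumulate. A small demo of the difference; the bit positions here are placeholders, not the FSI register layout:

#include <stdio.h>

#define LRM_INV (1 << 0)	/* placeholder flag bits */
#define BRM_INV (1 << 1)

static unsigned int ckg2_bits(unsigned int flags)
{
	unsigned int data = 0;

	/* Independent ifs accumulate; a switch would pick one arm. */
	if (flags & LRM_INV)
		data |= 1 << 12;
	if (flags & BRM_INV)
		data |= 1 << 8;
	return data;
}

int main(void)
{
	printf("%#x\n", ckg2_bits(LRM_INV | BRM_INV));	/* 0x1100 */
	return 0;
}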
+ */ + +#include <libio.h> +#include <dwarf-regs.h> + +#define SPARC_MAX_REGS 96 + +const char *sparc_regs_table[SPARC_MAX_REGS] = { + "%g0", "%g1", "%g2", "%g3", "%g4", "%g5", "%g6", "%g7", + "%o0", "%o1", "%o2", "%o3", "%o4", "%o5", "%sp", "%o7", + "%l0", "%l1", "%l2", "%l3", "%l4", "%l5", "%l6", "%l7", + "%i0", "%i1", "%i2", "%i3", "%i4", "%i5", "%fp", "%i7", + "%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7", + "%f8", "%f9", "%f10", "%f11", "%f12", "%f13", "%f14", "%f15", + "%f16", "%f17", "%f18", "%f19", "%f20", "%f21", "%f22", "%f23", + "%f24", "%f25", "%f26", "%f27", "%f28", "%f29", "%f30", "%f31", + "%f32", "%f33", "%f34", "%f35", "%f36", "%f37", "%f38", "%f39", + "%f40", "%f41", "%f42", "%f43", "%f44", "%f45", "%f46", "%f47", + "%f48", "%f49", "%f50", "%f51", "%f52", "%f53", "%f54", "%f55", + "%f56", "%f57", "%f58", "%f59", "%f60", "%f61", "%f62", "%f63", +}; + +/** + * get_arch_regstr() - lookup register name from its DWARF register number + * @n: the DWARF register number + * + * get_arch_regstr() returns the name of the register in struct + * regdwarfnum_table from its DWARF register number. If the register is not + * found in the table, this returns NULL. + */ +const char *get_arch_regstr(unsigned int n) +{ + return (n < SPARC_MAX_REGS) ? sparc_regs_table[n] : NULL; +}
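For completeness, a trivial caller for the new sparc helper; note that the bounds test wants to be strict (n < SPARC_MAX_REGS), since the table has exactly 96 entries and index 96 would read past it. The register number used here is arbitrary:

#include <stdio.h>

/* Provided by tools/perf/arch/sparc/util/dwarf-regs.c above. */
extern const char *get_arch_regstr(unsigned int n);

int main(void)
{
	unsigned int regnum = 14;	/* DWARF reg 14 is %sp in the table */
	const char *name = get_arch_regstr(regnum);

	printf("DWARF reg %u -> %s\n", regnum, name ? name : "<unknown>");
	return 0;
}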