From 86d56134f1b67d0c18025ba5cade95c048ed528d Mon Sep 17 00:00:00 2001 From: Michael Marineau Date: Thu, 10 Apr 2014 14:09:31 -0700 Subject: kobject: Make support for uevent_helper optional. Support for uevent_helper, aka hotplug, is not required on many systems these days but it can still be enabled via sysfs or sysctl. Reported-by: Darren Shepherd Signed-off-by: Michael Marineau Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe3..bc966a8ffc3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -643,7 +643,7 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif - +#ifdef CONFIG_UEVENT_HELPER { .procname = "hotplug", .data = &uevent_helper, @@ -651,7 +651,7 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dostring, }, - +#endif #ifdef CONFIG_CHR_DEV_SG { .procname = "sg-big-buff", -- cgit v1.2.3-70-g09d2 From 3d7ee969bffcc984c8aeaffc6ac6816fd929ace1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 May 2014 12:19:32 -0700 Subject: x86, vdso: Clean up 32-bit vs 64-bit vdso params Rather than using 'vdso_enabled' and an awful #define, just call the parameters vdso32_enabled and vdso64_enabled. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/87913de56bdcbae3d93917938302fc369b05caee.1399317206.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/elf.h | 20 +++++++++++++------- arch/x86/um/vdso/vma.c | 2 +- arch/x86/vdso/vdso32-setup.c | 19 ++++++++----------- arch/x86/vdso/vma.c | 6 +++--- kernel/sysctl.c | 5 +++++ 5 files changed, 30 insertions(+), 22 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2c71182d30e..e96df2c0dd6 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -75,7 +75,12 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #include -extern unsigned int vdso_enabled; +#ifdef CONFIG_X86_64 +extern unsigned int vdso64_enabled; +#endif +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +extern unsigned int vdso32_enabled; +#endif /* * This is used to ensure we don't load something for the wrong architecture. @@ -269,9 +274,9 @@ extern int force_personality32; struct task_struct; -#define ARCH_DLINFO_IA32(vdso_enabled) \ +#define ARCH_DLINFO_IA32 \ do { \ - if (vdso_enabled) { \ + if (vdso32_enabled) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -281,7 +286,7 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) +#define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -292,14 +297,15 @@ do { \ #define ARCH_DLINFO \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) +/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ #define ARCH_DLINFO_X32 \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ (unsigned long)current->mm->context.vdso); \ } while (0) @@ -310,7 +316,7 @@ do { \ if (test_thread_flag(TIF_X32)) \ ARCH_DLINFO_X32; \ else \ - ARCH_DLINFO_IA32(sysctl_vsyscall32) + ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index af91901babb..916cda4cd5b 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,7 +12,7 @@ #include #include -unsigned int __read_mostly vdso_enabled = 1; +static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; extern unsigned long task_size; diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 00348980a3a..5a657d93c6e 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -37,7 +37,6 @@ #endif #ifdef CONFIG_X86_64 -#define vdso_enabled sysctl_vsyscall32 #define arch_setup_additional_pages syscall32_setup_pages #endif @@ -45,13 +44,13 @@ * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; -static int __init vdso_setup(char *s) +static int __init vdso32_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso_enabled > 1) + if (vdso32_enabled > 1) pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); return 1; @@ -62,12 +61,10 @@ static int __init vdso_setup(char *s) * behavior on both 64-bit and 32-bit kernels. * On 32-bit kernels, vdso=[012] means the same thing. */ -__setup("vdso32=", vdso_setup); +__setup("vdso32=", vdso32_setup); #ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso32_setup, vdso_setup, 0); - -EXPORT_SYMBOL_GPL(vdso_enabled); +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif static struct page **vdso32_pages; @@ -160,7 +157,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return x32_setup_additional_pages(bprm, uses_interp); #endif - if (vdso_enabled != 1) /* Other values all mean "disabled" */ + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; down_write(&mm->mmap_sem); @@ -244,7 +241,7 @@ subsys_initcall(sysenter_setup); static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", - .data = &sysctl_vsyscall32, + .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 1ad10261312..8b790398ed1 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -17,7 +17,7 @@ #include #if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso_enabled = 1; +unsigned int __read_mostly vdso64_enabled = 1; DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; @@ -160,7 +160,7 @@ static int setup_additional_pages(struct linux_binprm *bprm, unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso64_enabled) return 0; down_write(&mm->mmap_sem); @@ -203,7 +203,7 @@ int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) static __init int vdso_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso64_enabled = simple_strtoul(s, NULL, 0); return 0; } __setup("vdso=", vdso_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe3..420d77afa8f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1418,8 +1418,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, -- cgit v1.2.3-70-g09d2 From 122ff243f5f104194750ecbc76d5946dd1eec934 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Mon, 12 May 2014 16:04:53 -0700 Subject: ipv4: make ip_local_reserved_ports per netns ip_local_port_range is already per netns, so should ip_local_reserved_ports be. And since it is none by default we don't actually need it when we don't enable CONFIG_SYSCTL. By the way, rename inet_is_reserved_local_port() to inet_is_local_reserved_port() Cc: "David S. Miller" Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/net/ip.h | 14 +++++++++++--- include/net/netns/ipv4.h | 4 ++++ kernel/sysctl.c | 4 ++-- net/ipv4/af_inet.c | 8 +------- net/ipv4/inet_connection_sock.c | 5 +---- net/ipv4/inet_hashtables.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 31 ++++++++++++++----------------- net/ipv4/udp.c | 2 +- net/sctp/socket.c | 5 +++-- 9 files changed, 38 insertions(+), 37 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/net/ip.h b/include/net/ip.h index 14c50a1650e..512bcd5daba 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -208,11 +208,19 @@ static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_o void inet_get_local_port_range(struct net *net, int *low, int *high); -extern unsigned long *sysctl_local_reserved_ports; -static inline int inet_is_reserved_local_port(int port) +#if CONFIG_SYSCTL +static inline int inet_is_local_reserved_port(struct net *net, int port) { - return test_bit(port, sysctl_local_reserved_ports); + if (!net->ipv4.sysctl_local_reserved_ports) + return 0; + return test_bit(port, net->ipv4.sysctl_local_reserved_ports); } +#else +static inline int inet_is_local_reserved_port(struct net *net, int port) +{ + return 0; +} +#endif extern int sysctl_ip_nonlocal_bind; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2f0cfad6666..aec5e12f9f1 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -84,6 +84,10 @@ struct netns_ipv4 { atomic_t dev_addr_genid; +#ifdef CONFIG_SYSCTL + unsigned long *sysctl_local_reserved_ports; +#endif + #ifdef CONFIG_IP_MROUTE #ifndef CONFIG_IP_MROUTE_MULTIPLE_TABLES struct mr_table *mrt; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe3..e36ae4b1572 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2501,11 +2501,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bool first = 1; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; - unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; - if (!bitmap_len || !left || (*ppos && !write)) { + if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 211c0cc6c3d..279132bcadd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1705,13 +1705,9 @@ static int __init inet_init(void) BUILD_BUG_ON(sizeof(struct inet_skb_parm) > FIELD_SIZEOF(struct sk_buff, cb)); - sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); - if (!sysctl_local_reserved_ports) - goto out; - rc = proto_register(&tcp_prot, 1); if (rc) - goto out_free_reserved_ports; + goto out; rc = proto_register(&udp_prot, 1); if (rc) @@ -1821,8 +1817,6 @@ out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); -out_free_reserved_ports: - kfree(sysctl_local_reserved_ports); goto out; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 12e502cbfdc..14d02ea905b 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -29,9 +29,6 @@ const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif -unsigned long *sysctl_local_reserved_ports; -EXPORT_SYMBOL(sysctl_local_reserved_ports); - void inet_get_local_port_range(struct net *net, int *low, int *high) { unsigned int seq; @@ -113,7 +110,7 @@ again: smallest_size = -1; do { - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) goto next_nolock; head = &hashinfo->bhash[inet_bhashfn(net, rover, hashinfo->bhash_size)]; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8b9cf279450..83331f1b86a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -500,7 +500,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, local_bh_disable(); for (i = 1; i <= remaining; i++) { port = low + (i + offset) % remaining; - if (inet_is_reserved_local_port(port)) + if (inet_is_local_reserved_port(net, port)) continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a33b9fbc1d8..79a007c5255 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -436,13 +436,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "ip_local_reserved_ports", - .data = NULL, /* initialized in sysctl_ipv4_init */ - .maxlen = 65536, - .mode = 0644, - .proc_handler = proc_do_large_bitmap, - }, { .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, @@ -824,6 +817,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = ipv4_local_port_range, }, + { + .procname = "ip_local_reserved_ports", + .data = &init_net.ipv4.sysctl_local_reserved_ports, + .maxlen = 65536, + .mode = 0644, + .proc_handler = proc_do_large_bitmap, + }, { .procname = "ip_no_pmtu_disc", .data = &init_net.ipv4.sysctl_ip_no_pmtu_disc, @@ -876,8 +876,14 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (net->ipv4.ipv4_hdr == NULL) goto err_reg; + net->ipv4.sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL); + if (!net->ipv4.sysctl_local_reserved_ports) + goto err_ports; + return 0; +err_ports: + unregister_net_sysctl_table(net->ipv4.ipv4_hdr); err_reg: if (!net_eq(net, &init_net)) kfree(table); @@ -889,6 +895,7 @@ static __net_exit void ipv4_sysctl_exit_net(struct net *net) { struct ctl_table *table; + kfree(net->ipv4.sysctl_local_reserved_ports); table = net->ipv4.ipv4_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.ipv4_hdr); kfree(table); @@ -902,16 +909,6 @@ static __net_initdata struct pernet_operations ipv4_sysctl_ops = { static __init int sysctl_ipv4_init(void) { struct ctl_table_header *hdr; - struct ctl_table *i; - - for (i = ipv4_table; i->procname; i++) { - if (strcmp(i->procname, "ip_local_reserved_ports") == 0) { - i->data = sysctl_local_reserved_ports; - break; - } - } - if (!i->procname) - return -EINVAL; hdr = register_net_sysctl(&init_net, "net/ipv4", ipv4_table); if (hdr == NULL) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 54ea0a3a48f..6729ea97a59 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && - !inet_is_reserved_local_port(snum)) + !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index e37b2cbbf17..2af76eaba8f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5946,8 +5946,9 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) /* Search for an available port. */ int low, high, remaining, index; unsigned int rover; + struct net *net = sock_net(sk); - inet_get_local_port_range(sock_net(sk), &low, &high); + inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; rover = prandom_u32() % remaining + low; @@ -5955,7 +5956,7 @@ static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) rover++; if ((rover < low) || (rover > high)) rover = low; - if (inet_is_reserved_local_port(rover)) + if (inet_is_local_reserved_port(net, rover)) continue; index = sctp_phashfn(sock_net(sk), rover); head = &sctp_port_hashtable[index]; -- cgit v1.2.3-70-g09d2 From 8c7260c0d9a6fa327546af724fb48375df493326 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Fri, 16 May 2014 23:26:02 +0200 Subject: sparc64: fix sparse warning in tsb.c Fix following warning: tsb.c:290:5: warning: symbol 'sysctl_tsb_ratio' was not declared. Should it be static? Add extern declaration in asm/setup.h and remove local declaration in kernel/sysctl.c Signed-off-by: Sam Ravnborg Signed-off-by: David S. Miller --- arch/sparc/include/asm/setup.h | 1 + arch/sparc/mm/tsb.c | 1 + kernel/sysctl.c | 4 ---- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index b5f5686ca57..0c4f69528d0 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -52,6 +52,7 @@ unsigned long safe_compute_effective_address(struct pt_regs *, unsigned int); int handle_ldf_stq(u32 insn, struct pt_regs *regs); void handle_ld_nf(u32 insn, struct pt_regs *regs); +extern int sysctl_tsb_ratio; #endif void sun_do_break(void); diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index f5d506fddda..33ca9360f59 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe3..3c202d00f6e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -152,10 +152,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); #ifdef CONFIG_SPARC #endif -#ifdef CONFIG_SPARC64 -extern int sysctl_tsb_ratio; -#endif - #ifdef __hppa__ extern int pwrsw_enabled; #endif -- cgit v1.2.3-70-g09d2 From f88083005ab319abba5d0b2e4e997558245493c8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:17 -0700 Subject: sysctl: clean up char buffer arguments When writing to a sysctl string, each write, regardless of VFS position, began writing the string from the start. This meant the contents of the last write to the sysctl controlled the string contents instead of the first. This misbehavior was featured in an exploit against Chrome OS. While it's not in itself a vulnerability, it's a weirdness that isn't on the mind of most auditors: "This filter looks correct, the first line written would not be meaningful to sysctl" doesn't apply here, since the size of the write and the contents of the final write are what matter when writing to sysctls. This adds the sysctl kernel.sysctl_writes_strict to control the write behavior. The default (0) reports when VFS position is non-0 on a write, but retains legacy behavior, -1 disables the warning, and 1 enables the position-respecting behavior. The long-term plan here is to wait for userspace to be fixed in response to the new warning and to then switch the default kernel behavior to the new position-respecting behavior. This patch (of 4): The char buffer arguments are needlessly cast in weird places. Clean it up so things are easier to read. Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 40ce2d983b1..3e214beabbe 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1703,8 +1703,8 @@ int __init sysctl_init(void) #ifdef CONFIG_PROC_SYSCTL -static int _proc_do_string(void* data, int maxlen, int write, - void __user *buffer, +static int _proc_do_string(char *data, int maxlen, int write, + char __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -1730,7 +1730,7 @@ static int _proc_do_string(void* data, int maxlen, int write, len = maxlen-1; if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) data)[len] = 0; + data[len] = 0; *ppos += *lenp; } else { len = strlen(data); @@ -1748,10 +1748,10 @@ static int _proc_do_string(void* data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, data, len)) + if (copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { - if(put_user('\n', ((char __user *) buffer) + len)) + if (put_user('\n', buffer + len)) return -EFAULT; len++; } @@ -1781,8 +1781,8 @@ static int _proc_do_string(void* data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, - buffer, lenp, ppos); + return _proc_do_string((char *)(table->data), table->maxlen, write, + (char __user *)buffer, lenp, ppos); } static size_t proc_skip_spaces(char **buf) -- cgit v1.2.3-70-g09d2 From 2ca9bb456ada8bcbdc8f77f8fc78207653bbaa92 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:18 -0700 Subject: sysctl: refactor sysctl string writing logic Consolidate buffer length checking with new-line/end-of-line checking. Additionally, instead of reading user memory twice, just do the assignment during the loop. This change doesn't affect the potential races here. It was already possible to read a sysctl that was in the middle of a write. In both cases, the string will always be NULL terminated. The pre-existing race remains a problem to be solved. Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e214beabbe..ac6847feaa8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1717,21 +1717,18 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { + /* Start writing from beginning of buffer. */ len = 0; + *ppos += *lenp; p = buffer; - while (len < *lenp) { + while ((p - buffer) < *lenp && len < maxlen - 1) { if (get_user(c, p++)) return -EFAULT; if (c == 0 || c == '\n') break; - len++; + data[len++] = c; } - if (len >= maxlen) - len = maxlen-1; - if(copy_from_user(data, buffer, len)) - return -EFAULT; data[len] = 0; - *ppos += *lenp; } else { len = strlen(data); if (len > maxlen) -- cgit v1.2.3-70-g09d2 From f4aacea2f5d1a5f7e3154e967d70cf3f711bcd61 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 6 Jun 2014 14:37:19 -0700 Subject: sysctl: allow for strict write position handling When writing to a sysctl string, each write, regardless of VFS position, begins writing the string from the start. This means the contents of the last write to the sysctl controls the string contents instead of the first: open("/proc/sys/kernel/modprobe", O_WRONLY) = 1 write(1, "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"..., 4096) = 4096 write(1, "/bin/true", 9) = 9 close(1) = 0 $ cat /proc/sys/kernel/modprobe /bin/true Expected behaviour would be to have the sysctl be "AAAA..." capped at maxlen (in this case KMOD_PATH_LEN: 256), instead of truncating to the contents of the second write. Similarly, multiple short writes would not append to the sysctl. The old behavior is unlike regular POSIX files enough that doing audits of software that interact with sysctls can end up in unexpected or dangerous situations. For example, "as long as the input starts with a trusted path" turns out to be an insufficient filter, as what must also happen is for the input to be entirely contained in a single write syscall -- not a common consideration, especially for high level tools. This provides kernel.sysctl_writes_strict as a way to make this behavior act in a less surprising manner for strings, and disallows non-zero file position when writing numeric sysctls (similar to what is already done when reading from non-zero file positions). For now, the default (0) is to warn about non-zero file position use, but retain the legacy behavior. Setting this to -1 disables the warning, and setting this to 1 enables the file position respecting behavior. [akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: move misplaced hunk, per Randy] Signed-off-by: Kees Cook Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 21 +++++++++++++ kernel/sysctl.c | 69 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 88 insertions(+), 2 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 9886c3d57fc..708bb7f1b7e 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -77,6 +77,7 @@ show up in /proc/sys/kernel: - shmmni - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt +- sysctl_writes_strict - tainted - threads-max - unknown_nmi_panic @@ -762,6 +763,26 @@ without users and with a dead originative process will be destroyed. ============================================================== +sysctl_writes_strict: + +Control how file position affects the behavior of updating sysctl values +via the /proc/sys interface: + + -1 - Legacy per-write sysctl value handling, with no printk warnings. + Each write syscall must fully contain the sysctl value to be + written, and multiple writes on the same sysctl file descriptor + will rewrite the sysctl value, regardless of file position. + 0 - (default) Same behavior as above, but warn about processes that + perform writes to a sysctl file descriptor when the file position + is not 0. + 1 - Respect file position when writing sysctl strings. Multiple writes + will append to the sysctl value buffer. Anything past the max length + of the sysctl value buffer will be ignored. Writes to numeric sysctl + entries must always be at file position 0 and the value must be + fully contained in the buffer sent in the write syscall. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ac6847feaa8..7a910b9081e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,6 +173,13 @@ extern int no_unaligned_warning; #endif #ifdef CONFIG_PROC_SYSCTL + +#define SYSCTL_WRITES_LEGACY -1 +#define SYSCTL_WRITES_WARN 0 +#define SYSCTL_WRITES_STRICT 1 + +static int sysctl_writes_strict = SYSCTL_WRITES_WARN; + static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int proc_taint(struct ctl_table *table, int write, @@ -495,6 +502,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_taint, }, + { + .procname = "sysctl_writes_strict", + .data = &sysctl_writes_strict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &neg_one, + .extra2 = &one, + }, #endif #ifdef CONFIG_LATENCYTOP { @@ -1717,8 +1733,20 @@ static int _proc_do_string(char *data, int maxlen, int write, } if (write) { - /* Start writing from beginning of buffer. */ - len = 0; + if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { + /* Only continue writes not past the end of buffer. */ + len = strlen(data); + if (len > maxlen - 1) + len = maxlen - 1; + + if (*ppos > len) + return 0; + len = *ppos; + } else { + /* Start writing from beginning of buffer. */ + len = 0; + } + *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { @@ -1758,6 +1786,14 @@ static int _proc_do_string(char *data, int maxlen, int write, return 0; } +static void warn_sysctl_write(struct ctl_table *table) +{ + pr_warn_once("%s wrote to %s when file position was not 0!\n" + "This will not be supported in the future. To silence this\n" + "warning, set kernel.sysctl_writes_strict = -1\n", + current->comm, table->procname); +} + /** * proc_dostring - read a string sysctl * @table: the sysctl table @@ -1778,6 +1814,9 @@ static int _proc_do_string(char *data, int maxlen, int write, int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN) + warn_sysctl_write(table); + return _proc_do_string((char *)(table->data), table->maxlen, write, (char __user *)buffer, lenp, ppos); } @@ -1953,6 +1992,18 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, conv = do_proc_dointvec_conv; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2010,6 +2061,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } @@ -2202,6 +2254,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int left = *lenp; if (write) { + if (*ppos) { + switch (sysctl_writes_strict) { + case SYSCTL_WRITES_STRICT: + goto out; + case SYSCTL_WRITES_WARN: + warn_sysctl_write(table); + break; + default: + break; + } + } + if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; page = __get_free_page(GFP_TEMPORARY); @@ -2257,6 +2321,7 @@ free: return err ? : -EINVAL; } *lenp -= left; +out: *ppos += *lenp; return err; } -- cgit v1.2.3-70-g09d2 From 6f8fd1d77e64dd6be1075041cdfd8de4c2ca4e2b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 6 Jun 2014 14:38:08 -0700 Subject: sysctl: convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- kernel/utsname_sysctl.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7a910b9081e..db19e3e2aa4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -202,7 +202,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE; -static int sysrq_sysctl_handler(ctl_table *table, int write, +static int sysrq_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 6fbe811c7ad..c8eac43267e 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -17,7 +17,7 @@ #ifdef CONFIG_PROC_SYSCTL -static void *get_uts(ctl_table *table, int write) +static void *get_uts(struct ctl_table *table, int write) { char *which = table->data; struct uts_namespace *uts_ns; @@ -32,7 +32,7 @@ static void *get_uts(ctl_table *table, int write) return which; } -static void put_uts(ctl_table *table, int write, void *which) +static void put_uts(struct ctl_table *table, int write, void *which) { if (!write) up_read(&uts_sem); @@ -44,7 +44,7 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, +static int proc_do_uts_string(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; -- cgit v1.2.3-70-g09d2 From 7cd2b0a34ab8e4db971920eef8982f985441adfb Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 23 Jun 2014 13:22:04 -0700 Subject: mm, pcp: allow restoring percpu_pagelist_fraction default Oleg reports a division by zero error on zero-length write() to the percpu_pagelist_fraction sysctl: divide error: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 9142 Comm: badarea_io Not tainted 3.15.0-rc2-vm-nfs+ #19 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff8800d5aeb6e0 ti: ffff8800d87a2000 task.ti: ffff8800d87a2000 RIP: 0010: percpu_pagelist_fraction_sysctl_handler+0x84/0x120 RSP: 0018:ffff8800d87a3e78 EFLAGS: 00010246 RAX: 0000000000000f89 RBX: ffff88011f7fd000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000010 RBP: ffff8800d87a3e98 R08: ffffffff81d002c8 R09: ffff8800d87a3f50 R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000060 R13: ffffffff81c3c3e0 R14: ffffffff81cfddf8 R15: ffff8801193b0800 FS: 00007f614f1e9740(0000) GS:ffff88011f440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f614f1fa000 CR3: 00000000d9291000 CR4: 00000000000006e0 Call Trace: proc_sys_call_handler+0xb3/0xc0 proc_sys_write+0x14/0x20 vfs_write+0xba/0x1e0 SyS_write+0x46/0xb0 tracesys+0xe1/0xe6 However, if the percpu_pagelist_fraction sysctl is set by the user, it is also impossible to restore it to the kernel default since the user cannot write 0 to the sysctl. This patch allows the user to write 0 to restore the default behavior. It still requires a fraction equal to or larger than 8, however, as stated by the documentation for sanity. If a value in the range [1, 7] is written, the sysctl will return EINVAL. This successfully solves the divide by zero issue at the same time. Signed-off-by: David Rientjes Reported-by: Oleg Drokin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 3 ++- kernel/sysctl.c | 3 +-- mm/page_alloc.c | 40 ++++++++++++++++++++++++++++------------ 3 files changed, 31 insertions(+), 15 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index bd4b34c0373..4415aa91568 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -702,7 +702,8 @@ The batch value of each per cpu pagelist is also updated as a result. It is set to pcp->high/4. The upper limit of batch is (PAGE_SHIFT * 8) The initial value is zero. Kernel does not use this value at boot time to set -the high water marks for each per cpu page list. +the high water marks for each per cpu page list. If the user writes '0' to this +sysctl, it will revert to this default behavior. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7de6555cfea..075d1903138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; -static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; @@ -1317,7 +1316,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(percpu_pagelist_fraction), .mode = 0644, .proc_handler = percpu_pagelist_fraction_sysctl_handler, - .extra1 = &min_percpu_pagelist_fract, + .extra1 = &zero, }, #ifdef CONFIG_MMU { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f59fa29eda..20d17f8266f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -69,6 +69,7 @@ /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); +#define MIN_PERCPU_PAGELIST_FRACTION (8) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); @@ -4145,7 +4146,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int __meminit zone_batchsize(struct zone *zone) +static int zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -4261,8 +4262,8 @@ static void pageset_set_high(struct per_cpu_pageset *p, pageset_update(&p->pcp, high, batch); } -static void __meminit pageset_set_high_and_batch(struct zone *zone, - struct per_cpu_pageset *pcp) +static void pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { if (percpu_pagelist_fraction) pageset_set_high(pcp, @@ -5881,23 +5882,38 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; - unsigned int cpu; + int old_percpu_pagelist_fraction; int ret; + mutex_lock(&pcp_batch_high_lock); + old_percpu_pagelist_fraction = percpu_pagelist_fraction; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); - if (!write || (ret < 0)) - return ret; + if (!write || ret < 0) + goto out; + + /* Sanity checking to avoid pcp imbalance */ + if (percpu_pagelist_fraction && + percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) { + percpu_pagelist_fraction = old_percpu_pagelist_fraction; + ret = -EINVAL; + goto out; + } + + /* No change? */ + if (percpu_pagelist_fraction == old_percpu_pagelist_fraction) + goto out; - mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; + unsigned int cpu; + for_each_possible_cpu(cpu) - pageset_set_high(per_cpu_ptr(zone->pageset, cpu), - high); + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); } +out: mutex_unlock(&pcp_batch_high_lock); - return 0; + return ret; } int hashdist = HASHDIST_DEFAULT; -- cgit v1.2.3-70-g09d2 From ed235875e2ca983197831337a986f0517074e1a0 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: kernel/watchdog.c: print traces for all cpus on lockup detection A 'softlockup' is defined as a bug that causes the kernel to loop in kernel mode for more than a predefined period to time, without giving other tasks a chance to run. Currently, upon detection of this condition by the per-cpu watchdog task, debug information (including a stack trace) is sent to the system log. On some occasions, we have observed that the "victim" rather than the actual "culprit" (i.e. the owner/holder of the contended resource) is reported to the user. Often this information has proven to be insufficient to assist debugging efforts. To avoid loss of useful debug information, for architectures which support NMI, this patch makes it possible to improve soft lockup reporting. This is accomplished by issuing an NMI to each cpu to obtain a stack trace. If NMI is not supported we just revert back to the old method. A sysctl and boot-time parameter is available to toggle this feature. [dzickus@redhat.com: add CONFIG_SMP in certain areas] [akpm@linux-foundation.org: additional CONFIG_SMP=n optimisations] [mq@suse.cz: fix warning] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Cc: David S. Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Jan Moskyto Matejka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 5 +++++ Documentation/sysctl/kernel.txt | 17 ++++++++++++++++ include/linux/nmi.h | 1 + kernel/sysctl.c | 11 +++++++++++ kernel/watchdog.c | 39 +++++++++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 884904975d0..c1b9aa8c5a5 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3130,6 +3130,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL] Should the soft-lockup detector generate panics. Format: + softlockup_all_cpu_backtrace= + [KNL] Should the soft-lockup detector generate + backtraces on all cpus. + Format: + sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/laptops/sonypi.txt diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 708bb7f1b7e..c14374e7177 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -75,6 +75,7 @@ show up in /proc/sys/kernel: - shmall - shmmax [ sysv ipc ] - shmmni +- softlockup_all_cpu_backtrace - stop-a [ SPARC only ] - sysrq ==> Documentation/sysrq.txt - sysctl_writes_strict @@ -783,6 +784,22 @@ via the /proc/sys interface: ============================================================== +softlockup_all_cpu_backtrace: + +This value controls the soft lockup detector thread's behavior +when a soft lockup condition is detected as to whether or not +to gather further debug information. If enabled, each cpu will +be issued an NMI and instructed to capture stack trace. + +This feature is only applicable for architectures which support +NMI. + +0: do nothing. This is the default behavior. + +1: on detection capture more debug information. + +============================================================== + tainted: Non-zero if the kernel has been tainted. Numeric values, which diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a17ab6398d7..447775ee2c4 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -57,6 +57,7 @@ int hw_nmi_is_cpu_stuck(struct pt_regs *); u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_user_enabled; extern int watchdog_thresh; +extern int sysctl_softlockup_all_cpu_backtrace; struct ctl_table; extern int proc_dowatchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 075d1903138..75b22e22a72 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -860,6 +860,17 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_SMP + { + .procname = "softlockup_all_cpu_backtrace", + .data = &sysctl_softlockup_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SMP */ { .procname = "nmi_watchdog", .data = &watchdog_user_enabled, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 30e482240da..c3319bd1b04 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -31,6 +31,12 @@ int watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#endif + static int __read_mostly watchdog_running; static u64 __read_mostly sample_period; @@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif +static unsigned long soft_lockup_nmi_warn; /* boot commands */ /* @@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str) } __setup("nosoftlockup", nosoftlockup_setup); /* */ +#ifdef CONFIG_SMP +static int __init softlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_softlockup_all_cpu_backtrace = + !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); +#endif /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- @@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); struct pt_regs *regs = get_irq_regs(); int duration; + int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; /* kick the hardlockup detector */ watchdog_interrupt_count(); @@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) if (__this_cpu_read(soft_watchdog_warn) == true) return HRTIMER_RESTART; + if (softlockup_all_cpu_backtrace) { + /* Prevent multiple soft-lockup reports if one cpu is already + * engaged in dumping cpu back traces + */ + if (test_and_set_bit(0, &soft_lockup_nmi_warn)) { + /* Someone else will report us. Let's give up */ + __this_cpu_write(soft_watchdog_warn, true); + return HRTIMER_RESTART; + } + } + printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); @@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) else dump_stack(); + if (softlockup_all_cpu_backtrace) { + /* Avoid generating two back traces for current + * given that one is already made above + */ + trigger_allbutself_cpu_backtrace(); + + clear_bit(0, &soft_lockup_nmi_warn); + /* Barrier to sync with other cpus */ + smp_mb__after_atomic(); + } + if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); -- cgit v1.2.3-70-g09d2