summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks103
-rw-r--r--kernel/Makefile5
-rw-r--r--kernel/audit_tree.c19
-rw-r--r--kernel/cpu.c21
-rw-r--r--kernel/debug/debug_core.c14
-rw-r--r--kernel/debug/kdb/kdb_debugger.c4
-rw-r--r--kernel/debug/kdb/kdb_io.c11
-rw-r--r--kernel/debug/kdb/kdb_main.c46
-rw-r--r--kernel/events/callchain.c35
-rw-r--r--kernel/events/core.c308
-rw-r--r--kernel/events/hw_breakpoint.c11
-rw-r--r--kernel/events/internal.h85
-rw-r--r--kernel/events/ring_buffer.c10
-rw-r--r--kernel/events/uprobes.c248
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/futex.c17
-rw-r--r--kernel/irq/chip.c1
-rw-r--r--kernel/irq/dummychip.c2
-rw-r--r--kernel/irq/manage.c15
-rw-r--r--kernel/kprobes.c247
-rw-r--r--kernel/kthread.c185
-rw-r--r--kernel/lockdep.c39
-rw-r--r--kernel/pid_namespace.c6
-rw-r--r--kernel/power/suspend.c3
-rw-r--r--kernel/printk.c2
-rw-r--r--kernel/rcupdate.c4
-rw-r--r--kernel/rcutiny.c33
-rw-r--r--kernel/rcutiny_plugin.h10
-rw-r--r--kernel/rcutorture.c159
-rw-r--r--kernel/rcutree.c916
-rw-r--r--kernel/rcutree.h50
-rw-r--r--kernel/rcutree_plugin.h597
-rw-r--r--kernel/rcutree_trace.c22
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/core.c766
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cputime.c530
-rw-r--r--kernel/sched/fair.c158
-rw-r--r--kernel/sched/features.h10
-rw-r--r--kernel/sched/rt.c19
-rw-r--r--kernel/sched/sched.h78
-rw-r--r--kernel/sched/stop_task.c22
-rw-r--r--kernel/signal.c18
-rw-r--r--kernel/smpboot.c233
-rw-r--r--kernel/smpboot.h4
-rw-r--r--kernel/softirq.c117
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/task_work.c112
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-sched.c7
-rw-r--r--kernel/time/timekeeping.c455
-rw-r--r--kernel/timer.c117
-rw-r--r--kernel/trace/Kconfig10
-rw-r--r--kernel/trace/Makefile8
-rw-r--r--kernel/trace/ftrace.c322
-rw-r--r--kernel/trace/ring_buffer.c4
-rw-r--r--kernel/trace/trace.c12
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_event_perf.c5
-rw-r--r--kernel/trace/trace_events.c116
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_functions.c14
-rw-r--r--kernel/trace/trace_functions_graph.c5
-rw-r--r--kernel/trace/trace_irqsoff.c5
-rw-r--r--kernel/trace/trace_kprobe.c6
-rw-r--r--kernel/trace/trace_sched_wakeup.c5
-rw-r--r--kernel/trace/trace_selftest.c304
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/trace/trace_syscalls.c10
-rw-r--r--kernel/trace/trace_uprobe.c2
-rw-r--r--kernel/watchdog.c280
-rw-r--r--kernel/workqueue.c147
73 files changed, 4296 insertions, 2876 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 2251882daf5..44511d100ea 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ
config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
bool
+config UNINLINE_SPIN_UNLOCK
+ bool
+
#
# lock_* functions are inlined when:
# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y
@@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y
#
+if !DEBUG_SPINLOCK
+
config INLINE_SPIN_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK
config INLINE_SPIN_TRYLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_TRYLOCK_BH
config INLINE_SPIN_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK
config INLINE_SPIN_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH
config INLINE_SPIN_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ
config INLINE_SPIN_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_SPIN_LOCK_IRQSAVE
-
-config UNINLINE_SPIN_UNLOCK
- bool
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE
config INLINE_SPIN_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH
config INLINE_SPIN_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE
config INLINE_READ_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_READ_TRYLOCK
config INLINE_READ_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK
config INLINE_READ_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH
config INLINE_READ_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ
config INLINE_READ_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_READ_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE
config INLINE_READ_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK
config INLINE_READ_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH
config INLINE_READ_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE
config INLINE_WRITE_TRYLOCK
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK
+ def_bool y
+ depends on ARCH_INLINE_WRITE_TRYLOCK
config INLINE_WRITE_LOCK
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK
config INLINE_WRITE_LOCK_BH
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_BH
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH
config INLINE_WRITE_LOCK_IRQ
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQ
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ
config INLINE_WRITE_LOCK_IRQSAVE
- def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
- ARCH_INLINE_WRITE_LOCK_IRQSAVE
+ def_bool y
+ depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE
config INLINE_WRITE_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK
config INLINE_WRITE_UNLOCK_BH
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQ
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH)
+ def_bool y
+ depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH
config INLINE_WRITE_UNLOCK_IRQRESTORE
- def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+ def_bool y
+ depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
+
+endif
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES
+ def_bool y
+ depends on SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764..5404911eaee 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
notifier.o ksysfs.o cred.o \
- async.o range.o groups.o lglock.o
+ async.o range.o groups.o lglock.o smpboot.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
-obj-$(CONFIG_SMP) += smpboot.o
ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
@@ -98,7 +97,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
-obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_TRACE_CLOCK) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 3a5ca582ba1..ed206fd88cc 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -250,7 +250,6 @@ static void untag_chunk(struct node *p)
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
goto out;
}
@@ -259,7 +258,7 @@ static void untag_chunk(struct node *p)
fsnotify_duplicate_mark(&new->mark, entry);
if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
- free_chunk(new);
+ fsnotify_put_mark(&new->mark);
goto Fallback;
}
@@ -293,7 +292,7 @@ static void untag_chunk(struct node *p)
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
fsnotify_destroy_mark(entry);
- fsnotify_put_mark(entry);
+ fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
Fallback:
@@ -322,7 +321,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
entry = &chunk->mark;
if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) {
- free_chunk(chunk);
+ fsnotify_put_mark(entry);
return -ENOSPC;
}
@@ -347,6 +346,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
insert_hash(chunk);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
+ fsnotify_put_mark(entry); /* drop initial reference */
return 0;
}
@@ -396,7 +396,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
fsnotify_duplicate_mark(chunk_entry, old_entry);
if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) {
spin_unlock(&old_entry->lock);
- free_chunk(chunk);
+ fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
return -ENOSPC;
}
@@ -444,8 +444,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
fsnotify_destroy_mark(old_entry);
+ fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
- fsnotify_put_mark(old_entry); /* and kill it */
return 0;
}
@@ -916,7 +916,12 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
evict_chunk(chunk);
- fsnotify_put_mark(entry);
+
+ /*
+ * We are guaranteed to have at least one reference to the mark from
+ * either the inode or the caller of fsnotify_destroy_mark().
+ */
+ BUG_ON(atomic_read(&entry->refcnt) < 1);
}
static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 14d32588ccc..f560598807c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -280,12 +280,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
__func__, cpu);
goto out_release;
}
+ smpboot_park_threads(cpu);
err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
+ smpboot_unpark_threads(cpu);
cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
-
goto out_release;
}
BUG_ON(cpu_online(cpu));
@@ -354,6 +355,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out;
}
+ ret = smpboot_create_threads(cpu);
+ if (ret)
+ goto out;
+
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
@@ -368,6 +373,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
+ /* Wake the per cpu threads */
+ smpboot_unpark_threads(cpu);
+
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
@@ -439,14 +447,6 @@ EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
-void __weak arch_disable_nonboot_cpus_begin(void)
-{
-}
-
-void __weak arch_disable_nonboot_cpus_end(void)
-{
-}
-
int disable_nonboot_cpus(void)
{
int cpu, first_cpu, error = 0;
@@ -458,7 +458,6 @@ int disable_nonboot_cpus(void)
* with the userspace trying to use the CPU hotplug at the same time
*/
cpumask_clear(frozen_cpus);
- arch_disable_nonboot_cpus_begin();
printk("Disabling non-boot CPUs ...\n");
for_each_online_cpu(cpu) {
@@ -474,8 +473,6 @@ int disable_nonboot_cpus(void)
}
}
- arch_disable_nonboot_cpus_end();
-
if (!error) {
BUG_ON(num_online_cpus() > 1);
/* Make sure the CPUs won't be enabled by someone else */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0557f24c6bc..17e073c309e 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ int ret = 0;
+
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(0);
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
@@ -681,11 +685,15 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
ks->linux_regs = regs;
if (kgdb_reenter_check(ks))
- return 0; /* Ouch, double exception ! */
+ goto out; /* Ouch, double exception ! */
if (kgdb_info[ks->cpu].enter_kgdb != 0)
- return 0;
+ goto out;
- return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+ ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
+out:
+ if (arch_kgdb_ops.enable_nmi)
+ arch_kgdb_ops.enable_nmi(1);
+ return ret;
}
int kgdb_nmicallback(int cpu, void *regs)
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 8b68ce78ff1..be7b33b73d3 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -12,6 +12,7 @@
#include <linux/kdb.h>
#include <linux/kdebug.h>
#include <linux/export.h>
+#include <linux/hardirq.h>
#include "kdb_private.h"
#include "../debug_core.h"
@@ -52,6 +53,9 @@ int kdb_stub(struct kgdb_state *ks)
if (atomic_read(&kgdb_setting_breakpoint))
reason = KDB_REASON_KEYBOARD;
+ if (in_nmi())
+ reason = KDB_REASON_NMI;
+
for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
reason = KDB_REASON_BREAK;
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6f..0a69d2adc4f 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -715,9 +715,6 @@ kdb_printit:
/* check for having reached the LINES number of printed lines */
if (kdb_nextline == linecount) {
char buf1[16] = "";
-#if defined(CONFIG_SMP)
- char buf2[32];
-#endif
/* Watch out for recursion here. Any routine that calls
* kdb_printf will come back through here. And kdb_read
@@ -732,14 +729,6 @@ kdb_printit:
if (moreprompt == NULL)
moreprompt = "more> ";
-#if defined(CONFIG_SMP)
- if (strchr(moreprompt, '%')) {
- sprintf(buf2, moreprompt, get_cpu());
- put_cpu();
- moreprompt = buf2;
- }
-#endif
-
kdb_input_flush();
c = console_drivers;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 1f91413edb8..1261dc7eaeb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -21,6 +21,7 @@
#include <linux/smp.h>
#include <linux/utsname.h>
#include <linux/vmalloc.h>
+#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/init.h>
@@ -139,11 +140,10 @@ static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
static char *__env[] = {
#if defined(CONFIG_SMP)
"PROMPT=[%d]kdb> ",
- "MOREPROMPT=[%d]more> ",
#else
"PROMPT=kdb> ",
- "MOREPROMPT=more> ",
#endif
+ "MOREPROMPT=more> ",
"RADIX=16",
"MDCOUNT=8", /* lines of md output */
KDB_PLATFORM_ENV,
@@ -1236,18 +1236,6 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
*cmdbuf = '\0';
*(cmd_hist[cmd_head]) = '\0';
- if (KDB_FLAG(ONLY_DO_DUMP)) {
- /* kdb is off but a catastrophic error requires a dump.
- * Take the dump and reboot.
- * Turn on logging so the kdb output appears in the log
- * buffer in the dump.
- */
- const char *setargs[] = { "set", "LOGGING", "1" };
- kdb_set(2, setargs);
- kdb_reboot(0, NULL);
- /*NOTREACHED*/
- }
-
do_full_getstr:
#if defined(CONFIG_SMP)
snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
@@ -2120,6 +2108,32 @@ static int kdb_dmesg(int argc, const char **argv)
return 0;
}
#endif /* CONFIG_PRINTK */
+
+/* Make sure we balance enable/disable calls, must disable first. */
+static atomic_t kdb_nmi_disabled;
+
+static int kdb_disable_nmi(int argc, const char *argv[])
+{
+ if (atomic_read(&kdb_nmi_disabled))
+ return 0;
+ atomic_set(&kdb_nmi_disabled, 1);
+ arch_kgdb_ops.enable_nmi(0);
+ return 0;
+}
+
+static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp)
+{
+ if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0))
+ return -EINVAL;
+ arch_kgdb_ops.enable_nmi(1);
+ return 0;
+}
+
+static const struct kernel_param_ops kdb_param_ops_enable_nmi = {
+ .set = kdb_param_enable_nmi,
+};
+module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600);
+
/*
* kdb_cpu - This function implements the 'cpu' command.
* cpu [<cpunum>]
@@ -2864,6 +2878,10 @@ static void __init kdb_inittab(void)
kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
"Display syslog buffer", 0, KDB_REPEAT_NONE);
#endif
+ if (arch_kgdb_ops.enable_nmi) {
+ kdb_register_repeat("disable_nmi", kdb_disable_nmi, "",
+ "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE);
+ }
kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
"Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 6581a040f39..c77206184b8 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -153,11 +153,17 @@ put_callchain_entry(int rctx)
put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
}
-struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
int rctx;
struct perf_callchain_entry *entry;
+ int kernel = !event->attr.exclude_callchain_kernel;
+ int user = !event->attr.exclude_callchain_user;
+
+ if (!kernel && !user)
+ return NULL;
entry = get_callchain_entry(&rctx);
if (rctx == -1)
@@ -168,18 +174,29 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
entry->nr = 0;
- if (!user_mode(regs)) {
+ if (kernel && !user_mode(regs)) {
perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
perf_callchain_kernel(entry, regs);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
}
- if (regs) {
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
+ if (user) {
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ /*
+ * Disallow cross-task user callchains.
+ */
+ if (event->ctx->task && event->ctx->task != current)
+ goto exit_put;
+
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
}
exit_put:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f1cf0edeb39..7b9df353ba1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,7 @@
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
+#include <linux/mm_types.h>
#include "internal.h"
@@ -1253,7 +1254,7 @@ retry:
/*
* Cross CPU call to disable a performance event
*/
-static int __perf_event_disable(void *info)
+int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -2935,12 +2936,12 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
/*
* Called when the last reference to the file is gone.
*/
-static int perf_release(struct inode *inode, struct file *file)
+static void put_event(struct perf_event *event)
{
- struct perf_event *event = file->private_data;
struct task_struct *owner;
- file->private_data = NULL;
+ if (!atomic_long_dec_and_test(&event->refcount))
+ return;
rcu_read_lock();
owner = ACCESS_ONCE(event->owner);
@@ -2975,7 +2976,13 @@ static int perf_release(struct inode *inode, struct file *file)
put_task_struct(owner);
}
- return perf_event_release_kernel(event);
+ perf_event_release_kernel(event);
+}
+
+static int perf_release(struct inode *inode, struct file *file)
+{
+ put_event(file->private_data);
+ return 0;
}
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
@@ -3227,7 +3234,7 @@ unlock:
static const struct file_operations perf_fops;
-static struct perf_event *perf_fget_light(int fd, int *fput_needed)
+static struct file *perf_fget_light(int fd, int *fput_needed)
{
struct file *file;
@@ -3241,7 +3248,7 @@ static struct perf_event *perf_fget_light(int fd, int *fput_needed)
return ERR_PTR(-EBADF);
}
- return file->private_data;
+ return file;
}
static int perf_event_set_output(struct perf_event *event,
@@ -3273,19 +3280,21 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case PERF_EVENT_IOC_SET_OUTPUT:
{
+ struct file *output_file = NULL;
struct perf_event *output_event = NULL;
int fput_needed = 0;
int ret;
if (arg != -1) {
- output_event = perf_fget_light(arg, &fput_needed);
- if (IS_ERR(output_event))
- return PTR_ERR(output_event);
+ output_file = perf_fget_light(arg, &fput_needed);
+ if (IS_ERR(output_file))
+ return PTR_ERR(output_file);
+ output_event = output_file->private_data;
}
ret = perf_event_set_output(event, output_event);
if (output_event)
- fput_light(output_event->filp, fput_needed);
+ fput_light(output_file, fput_needed);
return ret;
}
@@ -3756,6 +3765,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
+static void
+perf_output_sample_regs(struct perf_output_handle *handle,
+ struct pt_regs *regs, u64 mask)
+{
+ int bit;
+
+ for_each_set_bit(bit, (const unsigned long *) &mask,
+ sizeof(mask) * BITS_PER_BYTE) {
+ u64 val;
+
+ val = perf_reg_value(regs, bit);
+ perf_output_put(handle, val);
+ }
+}
+
+static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+ struct pt_regs *regs)
+{
+ if (!user_mode(regs)) {
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ regs_user->regs = regs;
+ regs_user->abi = perf_reg_abi(current);
+ }
+}
+
+/*
+ * Get remaining task size from user stack pointer.
+ *
+ * It'd be better to take stack vma map and limit this more
+ * precisly, but there's no way to get it safely under interrupt,
+ * so using TASK_SIZE as limit.
+ */
+static u64 perf_ustack_task_size(struct pt_regs *regs)
+{
+ unsigned long addr = perf_user_stack_pointer(regs);
+
+ if (!addr || addr >= TASK_SIZE)
+ return 0;
+
+ return TASK_SIZE - addr;
+}
+
+static u16
+perf_sample_ustack_size(u16 stack_size, u16 header_size,
+ struct pt_regs *regs)
+{
+ u64 task_size;
+
+ /* No regs, no stack pointer, no dump. */
+ if (!regs)
+ return 0;
+
+ /*
+ * Check if we fit in with the requested stack size into the:
+ * - TASK_SIZE
+ * If we don't, we limit the size to the TASK_SIZE.
+ *
+ * - remaining sample size
+ * If we don't, we customize the stack size to
+ * fit in to the remaining sample size.
+ */
+
+ task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
+ stack_size = min(stack_size, (u16) task_size);
+
+ /* Current header size plus static size and dynamic size. */
+ header_size += 2 * sizeof(u64);
+
+ /* Do we fit in with the current stack dump size? */
+ if ((u16) (header_size + stack_size) < header_size) {
+ /*
+ * If we overflow the maximum size for the sample,
+ * we customize the stack dump size to fit in.
+ */
+ stack_size = USHRT_MAX - header_size - sizeof(u64);
+ stack_size = round_up(stack_size, sizeof(u64));
+ }
+
+ return stack_size;
+}
+
+static void
+perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
+ struct pt_regs *regs)
+{
+ /* Case of a kernel thread, nothing to dump */
+ if (!regs) {
+ u64 size = 0;
+ perf_output_put(handle, size);
+ } else {
+ unsigned long sp;
+ unsigned int rem;
+ u64 dyn_size;
+
+ /*
+ * We dump:
+ * static size
+ * - the size requested by user or the best one we can fit
+ * in to the sample max size
+ * data
+ * - user stack dump data
+ * dynamic size
+ * - the actual dumped size
+ */
+
+ /* Static size. */
+ perf_output_put(handle, dump_size);
+
+ /* Data. */
+ sp = perf_user_stack_pointer(regs);
+ rem = __output_copy_user(handle, (void *) sp, dump_size);
+ dyn_size = dump_size - rem;
+
+ perf_output_skip(handle, rem);
+
+ /* Dynamic size. */
+ perf_output_put(handle, dyn_size);
+ }
+}
+
static void __perf_event_header__init_id(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event)
@@ -4016,6 +4151,28 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_put(handle, nr);
}
}
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ u64 abi = data->regs_user.abi;
+
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_user;
+ perf_output_sample_regs(handle,
+ data->regs_user.regs,
+ mask);
+ }
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER)
+ perf_output_sample_ustack(handle,
+ data->stack_user_size,
+ data->regs_user.regs);
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4039,7 +4196,7 @@ void perf_prepare_sample(struct perf_event_header *header,
if (sample_type & PERF_SAMPLE_CALLCHAIN) {
int size = 1;
- data->callchain = perf_callchain(regs);
+ data->callchain = perf_callchain(event, regs);
if (data->callchain)
size += data->callchain->nr;
@@ -4067,6 +4224,49 @@ void perf_prepare_sample(struct perf_event_header *header,
}
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_REGS_USER) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_user(&data->regs_user, regs);
+
+ if (data->regs_user.regs) {
+ u64 mask = event->attr.sample_regs_user;
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
+
+ if (sample_type & PERF_SAMPLE_STACK_USER) {
+ /*
+ * Either we need PERF_SAMPLE_STACK_USER bit to be allways
+ * processed as the last one or have additional check added
+ * in case new sample type is added, because we could eat
+ * up the rest of the sample size.
+ */
+ struct perf_regs_user *uregs = &data->regs_user;
+ u16 stack_size = event->attr.sample_stack_user;
+ u16 size = sizeof(u64);
+
+ if (!uregs->abi)
+ perf_sample_regs_user(uregs, regs);
+
+ stack_size = perf_sample_ustack_size(stack_size, header->size,
+ uregs->regs);
+
+ /*
+ * If there is something to dump, add space for the dump
+ * itself and for the field that tells the dynamic size,
+ * which is how many have been actually dumped.
+ */
+ if (stack_size)
+ size += sizeof(u64) + stack_size;
+
+ data->stack_user_size = stack_size;
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -5209,7 +5409,8 @@ static int perf_tp_event_match(struct perf_event *event,
}
void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
- struct pt_regs *regs, struct hlist_head *head, int rctx)
+ struct pt_regs *regs, struct hlist_head *head, int rctx,
+ struct task_struct *task)
{
struct perf_sample_data data;
struct perf_event *event;
@@ -5228,6 +5429,31 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
perf_swevent_event(event, count, &data, regs);
}
+ /*
+ * If we got specified a target task, also iterate its context and
+ * deliver this event there too.
+ */
+ if (task && task != current) {
+ struct perf_event_context *ctx;
+ struct trace_entry *entry = record;
+
+ rcu_read_lock();
+ ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
+ if (!ctx)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->attr.type != PERF_TYPE_TRACEPOINT)
+ continue;
+ if (event->attr.config != entry->type)
+ continue;
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
+unlock:
+ rcu_read_unlock();
+ }
+
perf_swevent_put_recursion_context(rctx);
}
EXPORT_SYMBOL_GPL(perf_tp_event);
@@ -5924,6 +6150,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
mutex_init(&event->mmap_mutex);
+ atomic_long_set(&event->refcount, 1);
event->cpu = cpu;
event->attr = *attr;
event->group_leader = group_leader;
@@ -6116,6 +6343,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
attr->branch_sample_type = mask;
}
}
+
+ if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
+ ret = perf_reg_validate(attr->sample_regs_user);
+ if (ret)
+ return ret;
+ }
+
+ if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
+ if (!arch_perf_have_user_stack_dump())
+ return -ENOSYS;
+
+ /*
+ * We have __u32 type for the size, but so far
+ * we can only use __u16 as maximum due to the
+ * __u16 sample size limit.
+ */
+ if (attr->sample_stack_user >= USHRT_MAX)
+ ret = -EINVAL;
+ else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
+ ret = -EINVAL;
+ }
+
out:
return ret;
@@ -6234,12 +6483,12 @@ SYSCALL_DEFINE5(perf_event_open,
return event_fd;
if (group_fd != -1) {
- group_leader = perf_fget_light(group_fd, &fput_needed);
- if (IS_ERR(group_leader)) {
- err = PTR_ERR(group_leader);
+ group_file = perf_fget_light(group_fd, &fput_needed);
+ if (IS_ERR(group_file)) {
+ err = PTR_ERR(group_file);
goto err_fd;
}
- group_file = group_leader->filp;
+ group_leader = group_file->private_data;
if (flags & PERF_FLAG_FD_OUTPUT)
output_event = group_leader;
if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6376,7 +6625,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_ctx(gctx);
}
- event->filp = event_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
@@ -6470,7 +6718,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_free;
}
- event->filp = NULL;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
@@ -6552,7 +6799,7 @@ static void sync_child_event(struct perf_event *child_event,
* Release the parent event, if this was the last
* reference to it.
*/
- fput(parent_event->filp);
+ put_event(parent_event);
}
static void
@@ -6628,9 +6875,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*
* __perf_event_exit_task()
* sync_child_event()
- * fput(parent_event->filp)
- * perf_release()
- * mutex_lock(&ctx->mutex)
+ * put_event()
+ * mutex_lock(&ctx->mutex)
*
* But since its the parent context it won't be the same instance.
*/
@@ -6698,7 +6944,7 @@ static void perf_free_event(struct perf_event *event,
list_del_init(&event->child_list);
mutex_unlock(&parent->child_mutex);
- fput(parent->filp);
+ put_event(parent);
perf_group_detach(event);
list_del_event(event, ctx);
@@ -6778,6 +7024,12 @@ inherit_event(struct perf_event *parent_event,
NULL, NULL);
if (IS_ERR(child_event))
return child_event;
+
+ if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ free_event(child_event);
+ return NULL;
+ }
+
get_ctx(child_ctx);
/*
@@ -6819,14 +7071,6 @@ inherit_event(struct perf_event *parent_event,
raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
/*
- * Get a reference to the parent filp - we will fput it
- * when the child event exits. This is safe to do because
- * we are in the parent and we know that the filp still
- * exists and has a nonzero count:
- */
- atomic_long_inc(&parent_event->filp->f_count);
-
- /*
* Link this into the parent event's child list
*/
WARN_ON_ONCE(parent_event->ctx->parent_ctx);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee1..9a7b487c6fe 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -453,7 +453,16 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
int old_type = bp->attr.bp_type;
int err = 0;
- perf_event_disable(bp);
+ /*
+ * modify_user_hw_breakpoint can be invoked with IRQs disabled and hence it
+ * will not be possible to raise IPIs that invoke __perf_event_disable.
+ * So call the function directly after making sure we are targeting the
+ * current task.
+ */
+ if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
+ __perf_event_disable(bp);
+ else
+ perf_event_disable(bp);
bp->attr.bp_addr = attr->bp_addr;
bp->attr.bp_type = attr->bp_type;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90af..d56a64c99a8 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -2,6 +2,7 @@
#define _KERNEL_EVENTS_INTERNAL_H
#include <linux/hardirq.h>
+#include <linux/uaccess.h>
/* Buffer handling */
@@ -76,32 +77,56 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
-static inline void
-__output_copy(struct perf_output_handle *handle,
- const void *buf, unsigned int len)
+#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
+static inline unsigned int \
+func_name(struct perf_output_handle *handle, \
+ const void *buf, unsigned int len) \
+{ \
+ unsigned long size, written; \
+ \
+ do { \
+ size = min_t(unsigned long, handle->size, len); \
+ \
+ written = memcpy_func(handle->addr, buf, size); \
+ \
+ len -= written; \
+ handle->addr += written; \
+ buf += written; \
+ handle->size -= written; \
+ if (!handle->size) { \
+ struct ring_buffer *rb = handle->rb; \
+ \
+ handle->page++; \
+ handle->page &= rb->nr_pages - 1; \
+ handle->addr = rb->data_pages[handle->page]; \
+ handle->size = PAGE_SIZE << page_order(rb); \
+ } \
+ } while (len && written == size); \
+ \
+ return len; \
+}
+
+static inline int memcpy_common(void *dst, const void *src, size_t n)
{
- do {
- unsigned long size = min_t(unsigned long, handle->size, len);
-
- memcpy(handle->addr, buf, size);
-
- len -= size;
- handle->addr += size;
- buf += size;
- handle->size -= size;
- if (!handle->size) {
- struct ring_buffer *rb = handle->rb;
-
- handle->page++;
- handle->page &= rb->nr_pages - 1;
- handle->addr = rb->data_pages[handle->page];
- handle->size = PAGE_SIZE << page_order(rb);
- }
- } while (len);
+ memcpy(dst, src, n);
+ return n;
}
+DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
+
+#define MEMCPY_SKIP(dst, src, n) (n)
+
+DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+
+#ifndef arch_perf_out_copy_user
+#define arch_perf_out_copy_user __copy_from_user_inatomic
+#endif
+
+DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
+
/* Callchain handling */
-extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern struct perf_callchain_entry *
+perf_callchain(struct perf_event *event, struct pt_regs *regs);
extern int get_callchain_buffers(void);
extern void put_callchain_buffers(void);
@@ -133,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx)
recursion[rctx]--;
}
+#ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return true;
+}
+
+#define perf_user_stack_pointer(regs) user_stack_pointer(regs)
+#else
+static inline bool arch_perf_have_user_stack_dump(void)
+{
+ return false;
+}
+
+#define perf_user_stack_pointer(regs) 0
+#endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 6ddaba43fb7..23cb34ff397 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -182,10 +182,16 @@ out:
return -ENOSPC;
}
-void perf_output_copy(struct perf_output_handle *handle,
+unsigned int perf_output_copy(struct perf_output_handle *handle,
const void *buf, unsigned int len)
{
- __output_copy(handle, buf, len);
+ return __output_copy(handle, buf, len);
+}
+
+unsigned int perf_output_skip(struct perf_output_handle *handle,
+ unsigned int len)
+{
+ return __output_skip(handle, NULL, len);
}
void perf_output_end(struct perf_output_handle *handle)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c08a22d02f7..912ef48d28a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -280,12 +280,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_
if (ret <= 0)
return ret;
- lock_page(page);
vaddr_new = kmap_atomic(page);
vaddr &= ~PAGE_MASK;
memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE);
kunmap_atomic(vaddr_new);
- unlock_page(page);
put_page(page);
@@ -334,7 +332,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
*/
result = is_swbp_at_addr(mm, vaddr);
if (result == 1)
- return -EEXIST;
+ return 0;
if (result)
return result;
@@ -347,24 +345,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
* @mm: the probed process address space.
* @auprobe: arch specific probepoint information.
* @vaddr: the virtual address to insert the opcode.
- * @verify: if true, verify existance of breakpoint instruction.
*
* For mm @mm, restore the original opcode (opcode) at @vaddr.
* Return 0 (success) or a negative errno.
*/
int __weak
-set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify)
+set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
{
- if (verify) {
- int result;
+ int result;
+
+ result = is_swbp_at_addr(mm, vaddr);
+ if (!result)
+ return -EINVAL;
- result = is_swbp_at_addr(mm, vaddr);
- if (!result)
- return -EINVAL;
+ if (result != 1)
+ return result;
- if (result != 1)
- return result;
- }
return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
}
@@ -415,11 +411,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
static struct uprobe *find_uprobe(struct inode *inode, loff_t offset)
{
struct uprobe *uprobe;
- unsigned long flags;
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
uprobe = __find_uprobe(inode, offset);
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
return uprobe;
}
@@ -466,12 +461,11 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
*/
static struct uprobe *insert_uprobe(struct uprobe *uprobe)
{
- unsigned long flags;
struct uprobe *u;
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
u = __insert_uprobe(uprobe);
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
/* For now assume that the instruction need not be single-stepped */
uprobe->flags |= UPROBE_SKIP_SSTEP;
@@ -649,6 +643,7 @@ static int
install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long vaddr)
{
+ bool first_uprobe;
int ret;
/*
@@ -659,7 +654,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
* Hence behave as if probe already existed.
*/
if (!uprobe->consumers)
- return -EEXIST;
+ return 0;
if (!(uprobe->flags & UPROBE_COPY_INSN)) {
ret = copy_insn(uprobe, vma->vm_file);
@@ -681,17 +676,18 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
}
/*
- * Ideally, should be updating the probe count after the breakpoint
- * has been successfully inserted. However a thread could hit the
- * breakpoint we just inserted even before the probe count is
- * incremented. If this is the first breakpoint placed, breakpoint
- * notifier might ignore uprobes and pass the trap to the thread.
- * Hence increment before and decrement on failure.
+ * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(),
+ * the task can hit this breakpoint right after __replace_page().
*/
- atomic_inc(&mm->uprobes_state.count);
+ first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags);
+ if (first_uprobe)
+ set_bit(MMF_HAS_UPROBES, &mm->flags);
+
ret = set_swbp(&uprobe->arch, mm, vaddr);
- if (ret)
- atomic_dec(&mm->uprobes_state.count);
+ if (!ret)
+ clear_bit(MMF_RECALC_UPROBES, &mm->flags);
+ else if (first_uprobe)
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
return ret;
}
@@ -699,8 +695,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
static void
remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
{
- if (!set_orig_insn(&uprobe->arch, mm, vaddr, true))
- atomic_dec(&mm->uprobes_state.count);
+ /* can happen if uprobe_register() fails */
+ if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
+ return;
+
+ set_bit(MMF_RECALC_UPROBES, &mm->flags);
+ set_orig_insn(&uprobe->arch, mm, vaddr);
}
/*
@@ -710,11 +710,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
*/
static void delete_uprobe(struct uprobe *uprobe)
{
- unsigned long flags;
-
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
rb_erase(&uprobe->rb_node, &uprobes_tree);
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
iput(uprobe->inode);
put_uprobe(uprobe);
atomic_dec(&uprobe_events);
@@ -831,17 +829,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
- if (is_register) {
+ if (is_register)
err = install_breakpoint(uprobe, mm, vma, info->vaddr);
- /*
- * We can race against uprobe_mmap(), see the
- * comment near uprobe_hash().
- */
- if (err == -EEXIST)
- err = 0;
- } else {
+ else
remove_breakpoint(uprobe, mm, info->vaddr);
- }
+
unlock:
up_write(&mm->mmap_sem);
free:
@@ -908,7 +900,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
}
mutex_unlock(uprobes_hash(inode));
- put_uprobe(uprobe);
+ if (uprobe)
+ put_uprobe(uprobe);
return ret;
}
@@ -978,7 +971,6 @@ static void build_probe_list(struct inode *inode,
struct list_head *head)
{
loff_t min, max;
- unsigned long flags;
struct rb_node *n, *t;
struct uprobe *u;
@@ -986,7 +978,7 @@ static void build_probe_list(struct inode *inode,
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
- spin_lock_irqsave(&uprobes_treelock, flags);
+ spin_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
if (n) {
for (t = n; t; t = rb_prev(t)) {
@@ -1004,27 +996,20 @@ static void build_probe_list(struct inode *inode,
atomic_inc(&u->ref);
}
}
- spin_unlock_irqrestore(&uprobes_treelock, flags);
+ spin_unlock(&uprobes_treelock);
}
/*
- * Called from mmap_region.
- * called with mm->mmap_sem acquired.
+ * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
*
- * Return -ve no if we fail to insert probes and we cannot
- * bail-out.
- * Return 0 otherwise. i.e:
- *
- * - successful insertion of probes
- * - (or) no possible probes to be inserted.
- * - (or) insertion of probes failed but we can bail-out.
+ * Currently we ignore all errors and always return 0, the callers
+ * can't handle the failure anyway.
*/
int uprobe_mmap(struct vm_area_struct *vma)
{
struct list_head tmp_list;
struct uprobe *uprobe, *u;
struct inode *inode;
- int ret, count;
if (!atomic_read(&uprobe_events) || !valid_vma(vma, true))
return 0;
@@ -1036,44 +1021,35 @@ int uprobe_mmap(struct vm_area_struct *vma)
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
- ret = 0;
- count = 0;
-
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- if (!ret) {
+ if (!fatal_signal_pending(current)) {
unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
-
- ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
- /*
- * We can race against uprobe_register(), see the
- * comment near uprobe_hash().
- */
- if (ret == -EEXIST) {
- ret = 0;
-
- if (!is_swbp_at_addr(vma->vm_mm, vaddr))
- continue;
-
- /*
- * Unable to insert a breakpoint, but
- * breakpoint lies underneath. Increment the
- * probe count.
- */
- atomic_inc(&vma->vm_mm->uprobes_state.count);
- }
-
- if (!ret)
- count++;
+ install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
put_uprobe(uprobe);
}
-
mutex_unlock(uprobes_mmap_hash(inode));
- if (ret)
- atomic_sub(count, &vma->vm_mm->uprobes_state.count);
+ return 0;
+}
- return ret;
+static bool
+vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end)
+{
+ loff_t min, max;
+ struct inode *inode;
+ struct rb_node *n;
+
+ inode = vma->vm_file->f_mapping->host;
+
+ min = vaddr_to_offset(vma, start);
+ max = min + (end - start) - 1;
+
+ spin_lock(&uprobes_treelock);
+ n = find_node_in_range(inode, min, max);
+ spin_unlock(&uprobes_treelock);
+
+ return !!n;
}
/*
@@ -1081,37 +1057,18 @@ int uprobe_mmap(struct vm_area_struct *vma)
*/
void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
{
- struct list_head tmp_list;
- struct uprobe *uprobe, *u;
- struct inode *inode;
-
if (!atomic_read(&uprobe_events) || !valid_vma(vma, false))
return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
return;
- if (!atomic_read(&vma->vm_mm->uprobes_state.count))
- return;
-
- inode = vma->vm_file->f_mapping->host;
- if (!inode)
+ if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
+ test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags))
return;
- mutex_lock(uprobes_mmap_hash(inode));
- build_probe_list(inode, vma, start, end, &tmp_list);
-
- list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
- unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
- /*
- * An unregister could have removed the probe before
- * unmap. So check before we decrement the count.
- */
- if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1)
- atomic_dec(&vma->vm_mm->uprobes_state.count);
- put_uprobe(uprobe);
- }
- mutex_unlock(uprobes_mmap_hash(inode));
+ if (vma_has_uprobes(vma, start, end))
+ set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
/* Slot allocation for XOL */
@@ -1213,13 +1170,15 @@ void uprobe_clear_state(struct mm_struct *mm)
kfree(area);
}
-/*
- * uprobe_reset_state - Free the area allocated for slots.
- */
-void uprobe_reset_state(struct mm_struct *mm)
+void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- mm->uprobes_state.xol_area = NULL;
- atomic_set(&mm->uprobes_state.count, 0);
+ newmm->uprobes_state.xol_area = NULL;
+
+ if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
+ set_bit(MMF_HAS_UPROBES, &newmm->flags);
+ /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
+ set_bit(MMF_RECALC_UPROBES, &newmm->flags);
+ }
}
/*
@@ -1437,6 +1396,25 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs)
return false;
}
+static void mmf_recalc_uprobes(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!valid_vma(vma, false))
+ continue;
+ /*
+ * This is not strictly accurate, we can race with
+ * uprobe_unregister() and see the already removed
+ * uprobe if delete_uprobe() was not yet called.
+ */
+ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
+ return;
+ }
+
+ clear_bit(MMF_HAS_UPROBES, &mm->flags);
+}
+
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
{
struct mm_struct *mm = current->mm;
@@ -1458,11 +1436,24 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
} else {
*is_swbp = -EFAULT;
}
+
+ if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags))
+ mmf_recalc_uprobes(mm);
up_read(&mm->mmap_sem);
return uprobe;
}
+void __weak arch_uprobe_enable_step(struct arch_uprobe *arch)
+{
+ user_enable_single_step(current);
+}
+
+void __weak arch_uprobe_disable_step(struct arch_uprobe *arch)
+{
+ user_disable_single_step(current);
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1509,7 +1500,7 @@ static void handle_swbp(struct pt_regs *regs)
utask->state = UTASK_SSTEP;
if (!pre_ssout(uprobe, regs, bp_vaddr)) {
- user_enable_single_step(current);
+ arch_uprobe_enable_step(&uprobe->arch);
return;
}
@@ -1518,17 +1509,15 @@ cleanup_ret:
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
}
- if (uprobe) {
- if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
+ if (!(uprobe->flags & UPROBE_SKIP_SSTEP))
- /*
- * cannot singlestep; cannot skip instruction;
- * re-execute the instruction.
- */
- instruction_pointer_set(regs, bp_vaddr);
+ /*
+ * cannot singlestep; cannot skip instruction;
+ * re-execute the instruction.
+ */
+ instruction_pointer_set(regs, bp_vaddr);
- put_uprobe(uprobe);
- }
+ put_uprobe(uprobe);
}
/*
@@ -1547,10 +1536,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs)
else
WARN_ON_ONCE(1);
+ arch_uprobe_disable_step(&uprobe->arch);
put_uprobe(uprobe);
utask->active_uprobe = NULL;
utask->state = UTASK_RUNNING;
- user_disable_single_step(current);
xol_free_insn_slot(current);
spin_lock_irq(&current->sighand->siglock);
@@ -1589,8 +1578,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
struct uprobe_task *utask;
- if (!current->mm || !atomic_read(&current->mm->uprobes_state.count))
- /* task is currently not uprobed */
+ if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
return 0;
utask = current->utask;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3bd2280d79f..5a0e74d89a5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -353,6 +353,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
down_write(&oldmm->mmap_sem);
flush_cache_dup_mm(oldmm);
+ uprobe_dup_mmap(oldmm, mm);
/*
* Not linked in yet - no deadlock potential:
*/
@@ -454,9 +455,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
if (retval)
goto out;
-
- if (file && uprobe_mmap(tmp))
- goto out;
}
/* a new mm has just been created */
arch_dup_mmap(oldmm, mm);
@@ -839,8 +837,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
mm->pmd_huge_pte = NULL;
#endif
- uprobe_reset_state(mm);
-
if (!mm_init(mm, tsk))
goto fail_nomem;
@@ -1280,11 +1276,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
p->irq_events = 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- p->hardirqs_enabled = 1;
-#else
p->hardirqs_enabled = 0;
-#endif
p->hardirq_enable_ip = 0;
p->hardirq_enable_event = 0;
p->hardirq_disable_ip = _THIS_IP_;
diff --git a/kernel/futex.c b/kernel/futex.c
index e2b0fb9a0b3..3717e7b306e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2231,11 +2231,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
- * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and
- * complete the acquisition of the rt_mutex prior to returning to userspace.
- * This ensures the rt_mutex maintains an owner when it has waiters; without
- * one, the pi logic wouldn't know which task to boost/deboost, if there was a
- * need to.
+ * uaddr2 which must be PI aware and unique from uaddr. Normal wakeup will wake
+ * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
+ * userspace. This ensures the rt_mutex maintains an owner when it has waiters;
+ * without one, the pi logic would not know which task to boost/deboost, if
+ * there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
* via the following:
@@ -2272,6 +2272,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
struct futex_q q = futex_q_init;
int res, ret;
+ if (uaddr == uaddr2)
+ return -EINVAL;
+
if (!bitset)
return -EINVAL;
@@ -2343,7 +2346,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
* the pi_state.
*/
- WARN_ON(!&q.pi_state);
+ WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
debug_rt_mutex_free_waiter(&rt_waiter);
@@ -2370,7 +2373,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* fault, unlock the rt_mutex and return the fault to userspace.
*/
if (ret == -EFAULT) {
- if (rt_mutex_owner(pi_mutex) == current)
+ if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index eebd6d5cfb4..57d86d07221 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -671,6 +671,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_set_chip(irq, chip);
__irq_set_handler(irq, handle, 0, name);
}
+EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index b5fcd96c710..988dc58e884 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -6,6 +6,7 @@
*/
#include <linux/interrupt.h>
#include <linux/irq.h>
+#include <linux/export.h>
#include "internals.h"
@@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = {
.irq_mask = noop,
.irq_unmask = noop,
};
+EXPORT_SYMBOL_GPL(dummy_irq_chip);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a8e8f05962..4c69326aa77 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -944,6 +944,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
/*
+ * Drivers are often written to work w/o knowledge about the
+ * underlying irq chip implementation, so a request for a
+ * threaded irq without a primary hard irq context handler
+ * requires the ONESHOT flag to be set. Some irq chips like
+ * MSI based interrupts are per se one shot safe. Check the
+ * chip flags, so we can avoid the unmask dance at the end of
+ * the threaded handler for those.
+ */
+ if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
+ new->flags &= ~IRQF_ONESHOT;
+
+ /*
* The following block of code has to be executed atomically
*/
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1017,7 +1029,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
new->thread_mask = 1 << ffz(thread_mask);
- } else if (new->handler == irq_default_primary_handler) {
+ } else if (new->handler == irq_default_primary_handler &&
+ !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
/*
* The interrupt was requested with handler = NULL, so
* we use the default primary handler for it. But it
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c62b8546cc9..098f396aa40 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
{
LIST_HEAD(free_list);
+ mutex_lock(&kprobe_mutex);
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
- mutex_lock(&kprobe_mutex);
/*
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
/* Step 4: Free cleaned kprobes after quiesence period */
do_free_cleaned_kprobes(&free_list);
- mutex_unlock(&kprobe_mutex);
mutex_unlock(&module_mutex);
+ mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
@@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
struct kprobe *ap;
struct optimized_kprobe *op;
+ /* Impossible to optimize ftrace-based kprobe */
+ if (kprobe_ftrace(p))
+ return;
+
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ jump_label_lock();
+ mutex_lock(&text_mutex);
+
ap = alloc_aggr_kprobe(p);
if (!ap)
- return;
+ goto out;
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe */
arch_remove_optimized_kprobe(op);
kfree(op);
- return;
+ goto out;
}
init_aggr_kprobe(ap, p);
- optimize_kprobe(ap);
+ optimize_kprobe(ap); /* This just kicks optimizer thread */
+
+out:
+ mutex_unlock(&text_mutex);
+ jump_label_unlock();
}
#ifdef CONFIG_SYSCTL
@@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
}
#endif /* CONFIG_OPTPROBES */
+#ifdef KPROBES_CAN_USE_FTRACE
+static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
+ .func = kprobe_ftrace_handler,
+ .flags = FTRACE_OPS_FL_SAVE_REGS,
+};
+static int kprobe_ftrace_enabled;
+
+/* Must ensure p->addr is really on ftrace */
+static int __kprobes prepare_kprobe(struct kprobe *p)
+{
+ if (!kprobe_ftrace(p))
+ return arch_prepare_kprobe(p);
+
+ return arch_prepare_kprobe_ftrace(p);
+}
+
+/* Caller must lock kprobe_mutex */
+static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
+{
+ int ret;
+
+ ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+ (unsigned long)p->addr, 0, 0);
+ WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+ kprobe_ftrace_enabled++;
+ if (kprobe_ftrace_enabled == 1) {
+ ret = register_ftrace_function(&kprobe_ftrace_ops);
+ WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ }
+}
+
+/* Caller must lock kprobe_mutex */
+static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
+{
+ int ret;
+
+ kprobe_ftrace_enabled--;
+ if (kprobe_ftrace_enabled == 0) {
+ ret = unregister_ftrace_function(&kprobe_ftrace_ops);
+ WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
+ }
+ ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
+ (unsigned long)p->addr, 1, 0);
+ WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
+}
+#else /* !KPROBES_CAN_USE_FTRACE */
+#define prepare_kprobe(p) arch_prepare_kprobe(p)
+#define arm_kprobe_ftrace(p) do {} while (0)
+#define disarm_kprobe_ftrace(p) do {} while (0)
+#endif
+
/* Arm a kprobe with text_mutex */
static void __kprobes arm_kprobe(struct kprobe *kp)
{
+ if (unlikely(kprobe_ftrace(kp))) {
+ arm_kprobe_ftrace(kp);
+ return;
+ }
/*
* Here, since __arm_kprobe() doesn't use stop_machine(),
* this doesn't cause deadlock on text_mutex. So, we don't
@@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
}
/* Disarm a kprobe with text_mutex */
-static void __kprobes disarm_kprobe(struct kprobe *kp)
+static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
{
+ if (unlikely(kprobe_ftrace(kp))) {
+ disarm_kprobe_ftrace(kp);
+ return;
+ }
/* Ditto */
mutex_lock(&text_mutex);
- __disarm_kprobe(kp, true);
+ __disarm_kprobe(kp, reopt);
mutex_unlock(&text_mutex);
}
@@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
if (p->post_handler && !ap->post_handler)
ap->post_handler = aggr_post_handler;
- if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
- ap->flags &= ~KPROBE_FLAG_DISABLED;
- if (!kprobes_all_disarmed)
- /* Arm the breakpoint again. */
- __arm_kprobe(ap);
- }
return 0;
}
@@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
int ret = 0;
struct kprobe *ap = orig_p;
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ jump_label_lock();
+ /*
+ * Get online CPUs to avoid text_mutex deadlock.with stop machine,
+ * which is invoked by unoptimize_kprobe() in add_new_kprobe()
+ */
+ get_online_cpus();
+ mutex_lock(&text_mutex);
+
if (!kprobe_aggrprobe(orig_p)) {
/* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
ap = alloc_aggr_kprobe(orig_p);
- if (!ap)
- return -ENOMEM;
+ if (!ap) {
+ ret = -ENOMEM;
+ goto out;
+ }
init_aggr_kprobe(ap, orig_p);
} else if (kprobe_unused(ap))
/* This probe is going to die. Rescue it */
@@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
* free aggr_probe. It will be used next time, or
* freed by unregister_kprobe.
*/
- return ret;
+ goto out;
/* Prepare optimized instructions if possible. */
prepare_optimized_kprobe(ap);
@@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
/* Copy ap's insn slot to p */
copy_kprobe(ap, p);
- return add_new_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+
+out:
+ mutex_unlock(&text_mutex);
+ put_online_cpus();
+ jump_label_unlock();
+
+ if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
+ ap->flags &= ~KPROBE_FLAG_DISABLED;
+ if (!kprobes_all_disarmed)
+ /* Arm the breakpoint again. */
+ arm_kprobe(ap);
+ }
+ return ret;
}
static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -1313,71 +1402,96 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-int __kprobes register_kprobe(struct kprobe *p)
+static __kprobes int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
{
int ret = 0;
- struct kprobe *old_p;
- struct module *probed_mod;
- kprobe_opcode_t *addr;
-
- addr = kprobe_addr(p);
- if (IS_ERR(addr))
- return PTR_ERR(addr);
- p->addr = addr;
+ unsigned long ftrace_addr;
- ret = check_kprobe_rereg(p);
- if (ret)
- return ret;
+ /*
+ * If the address is located on a ftrace nop, set the
+ * breakpoint to the following instruction.
+ */
+ ftrace_addr = ftrace_location((unsigned long)p->addr);
+ if (ftrace_addr) {
+#ifdef KPROBES_CAN_USE_FTRACE
+ /* Given address is not on the instruction boundary */
+ if ((unsigned long)p->addr != ftrace_addr)
+ return -EILSEQ;
+ p->flags |= KPROBE_FLAG_FTRACE;
+#else /* !KPROBES_CAN_USE_FTRACE */
+ return -EINVAL;
+#endif
+ }
jump_label_lock();
preempt_disable();
+
+ /* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr) ||
- ftrace_text_reserved(p->addr, p->addr) ||
jump_label_text_reserved(p->addr, p->addr)) {
ret = -EINVAL;
- goto cannot_probe;
+ goto out;
}
- /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
- p->flags &= KPROBE_FLAG_DISABLED;
-
- /*
- * Check if are we probing a module.
- */
- probed_mod = __module_text_address((unsigned long) p->addr);
- if (probed_mod) {
- /* Return -ENOENT if fail. */
- ret = -ENOENT;
+ /* Check if are we probing a module */
+ *probed_mod = __module_text_address((unsigned long) p->addr);
+ if (*probed_mod) {
/*
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
- if (unlikely(!try_module_get(probed_mod)))
- goto cannot_probe;
+ if (unlikely(!try_module_get(*probed_mod))) {
+ ret = -ENOENT;
+ goto out;
+ }
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
*/
- if (within_module_init((unsigned long)p->addr, probed_mod) &&
- probed_mod->state != MODULE_STATE_COMING) {
- module_put(probed_mod);
- goto cannot_probe;
+ if (within_module_init((unsigned long)p->addr, *probed_mod) &&
+ (*probed_mod)->state != MODULE_STATE_COMING) {
+ module_put(*probed_mod);
+ *probed_mod = NULL;
+ ret = -ENOENT;
}
- /* ret will be updated by following code */
}
+out:
preempt_enable();
jump_label_unlock();
+ return ret;
+}
+
+int __kprobes register_kprobe(struct kprobe *p)
+{
+ int ret;
+ struct kprobe *old_p;
+ struct module *probed_mod;
+ kprobe_opcode_t *addr;
+
+ /* Adjust probe address from symbol */
+ addr = kprobe_addr(p);
+ if (IS_ERR(addr))
+ return PTR_ERR(addr);
+ p->addr = addr;
+
+ ret = check_kprobe_rereg(p);
+ if (ret)
+ return ret;
+
+ /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+ p->flags &= KPROBE_FLAG_DISABLED;
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
- mutex_lock(&kprobe_mutex);
- jump_label_lock(); /* needed to call jump_label_text_reserved() */
+ ret = check_kprobe_address_safe(p, &probed_mod);
+ if (ret)
+ return ret;
- get_online_cpus(); /* For avoiding text_mutex deadlock. */
- mutex_lock(&text_mutex);
+ mutex_lock(&kprobe_mutex);
old_p = get_kprobe(p->addr);
if (old_p) {
@@ -1386,7 +1500,9 @@ int __kprobes register_kprobe(struct kprobe *p)
goto out;
}
- ret = arch_prepare_kprobe(p);
+ mutex_lock(&text_mutex); /* Avoiding text modification */
+ ret = prepare_kprobe(p);
+ mutex_unlock(&text_mutex);
if (ret)
goto out;
@@ -1395,26 +1511,18 @@ int __kprobes register_kprobe(struct kprobe *p)
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
if (!kprobes_all_disarmed && !kprobe_disabled(p))
- __arm_kprobe(p);
+ arm_kprobe(p);
/* Try to optimize kprobe */
try_to_optimize_kprobe(p);
out:
- mutex_unlock(&text_mutex);
- put_online_cpus();
- jump_label_unlock();
mutex_unlock(&kprobe_mutex);
if (probed_mod)
module_put(probed_mod);
return ret;
-
-cannot_probe:
- preempt_enable();
- jump_label_unlock();
- return ret;
}
EXPORT_SYMBOL_GPL(register_kprobe);
@@ -1451,7 +1559,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
/* Try to disarm and disable this/parent probe */
if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
- disarm_kprobe(orig_p);
+ disarm_kprobe(orig_p, true);
orig_p->flags |= KPROBE_FLAG_DISABLED;
}
}
@@ -2049,10 +2157,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
if (!pp)
pp = p;
- seq_printf(pi, "%s%s%s\n",
+ seq_printf(pi, "%s%s%s%s\n",
(kprobe_gone(p) ? "[GONE]" : ""),
((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
- (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
+ (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""),
+ (kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}
static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -2131,14 +2240,12 @@ static void __kprobes arm_all_kprobes(void)
goto already_enabled;
/* Arming kprobes doesn't optimize kprobe itself */
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
if (!kprobe_disabled(p))
- __arm_kprobe(p);
+ arm_kprobe(p);
}
- mutex_unlock(&text_mutex);
kprobes_all_disarmed = false;
printk(KERN_INFO "Kprobes globally enabled\n");
@@ -2166,15 +2273,13 @@ static void __kprobes disarm_all_kprobes(void)
kprobes_all_disarmed = true;
printk(KERN_INFO "Kprobes globally disabled\n");
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
- __disarm_kprobe(p, false);
+ disarm_kprobe(p, false);
}
}
- mutex_unlock(&text_mutex);
mutex_unlock(&kprobe_mutex);
/* Wait for disarming all kprobes by optimizer */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b579af57ea1..146a6fa9682 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -37,11 +37,20 @@ struct kthread_create_info
};
struct kthread {
- int should_stop;
+ unsigned long flags;
+ unsigned int cpu;
void *data;
+ struct completion parked;
struct completion exited;
};
+enum KTHREAD_BITS {
+ KTHREAD_IS_PER_CPU = 0,
+ KTHREAD_SHOULD_STOP,
+ KTHREAD_SHOULD_PARK,
+ KTHREAD_IS_PARKED,
+};
+
#define to_kthread(tsk) \
container_of((tsk)->vfork_done, struct kthread, exited)
@@ -52,13 +61,29 @@ struct kthread {
* and this will return true. You should then return, and your return
* value will be passed through to kthread_stop().
*/
-int kthread_should_stop(void)
+bool kthread_should_stop(void)
{
- return to_kthread(current)->should_stop;
+ return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags);
}
EXPORT_SYMBOL(kthread_should_stop);
/**
+ * kthread_should_park - should this kthread park now?
+ *
+ * When someone calls kthread_park() on your kthread, it will be woken
+ * and this will return true. You should then do the necessary
+ * cleanup and call kthread_parkme()
+ *
+ * Similar to kthread_should_stop(), but this keeps the thread alive
+ * and in a park position. kthread_unpark() "restarts" the thread and
+ * calls the thread function again.
+ */
+bool kthread_should_park(void)
+{
+ return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
+}
+
+/**
* kthread_freezable_should_stop - should this freezable kthread return now?
* @was_frozen: optional out parameter, indicates whether %current was frozen
*
@@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+static void __kthread_parkme(struct kthread *self)
+{
+ __set_current_state(TASK_INTERRUPTIBLE);
+ while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
+ if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
+ complete(&self->parked);
+ schedule();
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+ clear_bit(KTHREAD_IS_PARKED, &self->flags);
+ __set_current_state(TASK_RUNNING);
+}
+
+void kthread_parkme(void)
+{
+ __kthread_parkme(to_kthread(current));
+}
+
static int kthread(void *_create)
{
/* Copy data: it's on kthread's stack */
@@ -105,9 +148,10 @@ static int kthread(void *_create)
struct kthread self;
int ret;
- self.should_stop = 0;
+ self.flags = 0;
self.data = data;
init_completion(&self.exited);
+ init_completion(&self.parked);
current->vfork_done = &self.exited;
/* OK, tell user we're spawned, wait for stop or wakeup */
@@ -117,9 +161,11 @@ static int kthread(void *_create)
schedule();
ret = -EINTR;
- if (!self.should_stop)
- ret = threadfn(data);
+ if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+ __kthread_parkme(&self);
+ ret = threadfn(data);
+ }
/* we can't just return, we must preserve "self" on stack */
do_exit(ret);
}
@@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create)
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
- void *data,
- int node,
+ void *data, int node,
const char namefmt[],
...)
{
@@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
+static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
+ p->flags |= PF_THREAD_BOUND;
+}
+
/**
* kthread_bind - bind a just-created kthread to a cpu.
* @p: thread created by kthread_create().
@@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
WARN_ON(1);
return;
}
-
- /* It's safe because the task is inactive. */
- do_set_cpus_allowed(p, cpumask_of(cpu));
- p->flags |= PF_THREAD_BOUND;
+ __kthread_bind(p, cpu);
}
EXPORT_SYMBOL(kthread_bind);
/**
+ * kthread_create_on_cpu - Create a cpu bound kthread
+ * @threadfn: the function to run until signal_pending(current).
+ * @data: data ptr for @threadfn.
+ * @cpu: The cpu on which the thread should be bound,
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
+ *
+ * Description: This helper function creates and names a kernel thread
+ * The thread will be woken and put into park mode.
+ */
+struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
+ void *data, unsigned int cpu,
+ const char *namefmt)
+{
+ struct task_struct *p;
+
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
+ cpu);
+ if (IS_ERR(p))
+ return p;
+ set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
+ to_kthread(p)->cpu = cpu;
+ /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */
+ kthread_park(p);
+ return p;
+}
+
+static struct kthread *task_get_live_kthread(struct task_struct *k)
+{
+ struct kthread *kthread;
+
+ get_task_struct(k);
+ kthread = to_kthread(k);
+ /* It might have exited */
+ barrier();
+ if (k->vfork_done != NULL)
+ return kthread;
+ return NULL;
+}
+
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false, wakes it, and
+ * waits for it to return. If the thread is marked percpu then its
+ * bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
+{
+ struct kthread *kthread = task_get_live_kthread(k);
+
+ if (kthread) {
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu);
+ wake_up_process(k);
+ }
+ }
+ put_task_struct(k);
+}
+
+/**
+ * kthread_park - park a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return true, wakes it, and
+ * waits for it to return. This can also be called after kthread_create()
+ * instead of calling wake_up_process(): the thread will park without
+ * calling threadfn().
+ *
+ * Returns 0 if the thread is parked, -ENOSYS if the thread exited.
+ * If called by the kthread itself just the park bit is set.
+ */
+int kthread_park(struct task_struct *k)
+{
+ struct kthread *kthread = task_get_live_kthread(k);
+ int ret = -ENOSYS;
+
+ if (kthread) {
+ if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ if (k != current) {
+ wake_up_process(k);
+ wait_for_completion(&kthread->parked);
+ }
+ }
+ ret = 0;
+ }
+ put_task_struct(k);
+ return ret;
+}
+
+/**
* kthread_stop - stop a thread created by kthread_create().
* @k: thread created by kthread_create().
*
@@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind);
*/
int kthread_stop(struct task_struct *k)
{
- struct kthread *kthread;
+ struct kthread *kthread = task_get_live_kthread(k);
int ret;
trace_sched_kthread_stop(k);
- get_task_struct(k);
-
- kthread = to_kthread(k);
- barrier(); /* it might have exited */
- if (k->vfork_done != NULL) {
- kthread->should_stop = 1;
+ if (kthread) {
+ set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ea9ee4518c3..7981e5b2350 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
+static int
+print_lock_nested_lock_not_held(struct task_struct *curr,
+ struct held_lock *hlock,
+ unsigned long ip)
+{
+ if (!debug_locks_off())
+ return 0;
+ if (debug_locks_silent)
+ return 0;
+
+ printk("\n");
+ printk("==================================\n");
+ printk("[ BUG: Nested lock was not taken ]\n");
+ print_kernel_ident();
+ printk("----------------------------------\n");
+
+ printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr));
+ print_lock(hlock);
+
+ printk("\nbut this task is not holding:\n");
+ printk("%s\n", hlock->nest_lock->name);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ printk("\nother info that might help us debug this:\n");
+ lockdep_print_held_locks(curr);
+
+ printk("\nstack backtrace:\n");
+ dump_stack();
+
+ return 0;
+}
+
+static int __lock_is_held(struct lockdep_map *lock);
+
/*
* This gets called for every mutex_lock*()/spin_lock*() operation.
* We maintain the dependency maps and validate the locking attempt:
@@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
chain_key = iterate_chain_key(chain_key, id);
+ if (nest_lock && !__lock_is_held(nest_lock))
+ return print_lock_nested_lock_not_held(curr, hlock, ip);
+
if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
return 0;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b3c7fd55425..6144bab8fd8 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -232,15 +232,19 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
*/
tmp.data = &current->nsproxy->pid_ns->last_pid;
- return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
}
+extern int pid_max;
+static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
+ .extra1 = &zero,
+ .extra2 = &pid_max,
},
{ }
};
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 1da39ea248f..c8b7446b27d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -178,9 +178,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
- /* Kick the lockup detector */
- lockup_detector_bootcpu_resume();
-
Enable_cpus:
enable_nonboot_cpus();
diff --git a/kernel/printk.c b/kernel/printk.c
index 6a76ab9d447..66a2ea37b57 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1034,6 +1034,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
struct log *msg = log_from_idx(idx);
len += msg_print_text(msg, prev, true, NULL, 0);
+ prev = msg->flags;
idx = log_next(idx);
seq++;
}
@@ -1046,6 +1047,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
struct log *msg = log_from_idx(idx);
len -= msg_print_text(msg, prev, true, NULL, 0);
+ prev = msg->flags;
idx = log_next(idx);
seq++;
}
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4e6a61b15e8..29ca1c6da59 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
#include <linux/mutex.h>
#include <linux/export.h>
#include <linux/hardirq.h>
+#include <linux/delay.h>
#define CREATE_TRACE_POINTS
#include <trace/events/rcu.h>
@@ -81,6 +82,9 @@ void __rcu_read_unlock(void)
} else {
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
+#ifdef CONFIG_PROVE_RCU_DELAY
+ udelay(10); /* Make preemption more probable. */
+#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
rcu_read_unlock_special(t);
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 547b1fe5b05..e4c6a598d6f 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head,
static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
-static void rcu_idle_enter_common(long long oldval)
+static void rcu_idle_enter_common(long long newval)
{
- if (rcu_dynticks_nesting) {
+ if (newval) {
RCU_TRACE(trace_rcu_dyntick("--=",
- oldval, rcu_dynticks_nesting));
+ rcu_dynticks_nesting, newval));
+ rcu_dynticks_nesting = newval;
return;
}
- RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
+ RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval));
if (!is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
- oldval, rcu_dynticks_nesting));
+ rcu_dynticks_nesting, newval));
ftrace_dump(DUMP_ALL);
WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
current->pid, current->comm,
idle->pid, idle->comm); /* must be idle task! */
}
rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+ barrier();
+ rcu_dynticks_nesting = newval;
}
/*
@@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval)
void rcu_idle_enter(void)
{
unsigned long flags;
- long long oldval;
+ long long newval;
local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0);
if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) ==
DYNTICK_TASK_NEST_VALUE)
- rcu_dynticks_nesting = 0;
+ newval = 0;
else
- rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
- rcu_idle_enter_common(oldval);
+ newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE;
+ rcu_idle_enter_common(newval);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
void rcu_irq_exit(void)
{
unsigned long flags;
- long long oldval;
+ long long newval;
local_irq_save(flags);
- oldval = rcu_dynticks_nesting;
- rcu_dynticks_nesting--;
- WARN_ON_ONCE(rcu_dynticks_nesting < 0);
- rcu_idle_enter_common(oldval);
+ newval = rcu_dynticks_nesting - 1;
+ WARN_ON_ONCE(newval < 0);
+ rcu_idle_enter_common(newval);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_irq_exit);
/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
static void rcu_idle_exit_common(long long oldval)
@@ -171,6 +173,7 @@ void rcu_irq_enter(void)
rcu_idle_exit_common(oldval);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_irq_enter);
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 918fd1e8509..3d019028220 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -278,7 +278,7 @@ static int rcu_boost(void)
rcu_preempt_ctrlblk.exp_tasks == NULL)
return 0; /* Nothing to boost. */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
/*
* Recheck with irqs disabled: all tasks in need of boosting
@@ -287,7 +287,7 @@ static int rcu_boost(void)
*/
if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
rcu_preempt_ctrlblk.exp_tasks == NULL) {
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);
return 0;
}
@@ -317,7 +317,7 @@ static int rcu_boost(void)
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
t->rcu_boost_mutex = &mtx;
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);
rt_mutex_lock(&mtx);
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -991,9 +991,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n)
{
unsigned long flags;
- raw_local_irq_save(flags);
+ local_irq_save(flags);
rcp->qlen -= n;
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);
}
/*
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 25b15033c61..aaa7b9f3532 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre
static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
static int nfakewriters = 4; /* # fake writer threads */
-static int stat_interval; /* Interval between stats, in seconds. */
- /* Defaults to "only at end of test". */
+static int stat_interval = 60; /* Interval between stats, in seconds. */
+ /* Zero means "only at end of test". */
static bool verbose; /* Print more debug info. */
-static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
+static bool test_no_idle_hz = true;
+ /* Test RCU support for tickless idle CPUs. */
static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
static int stutter = 5; /* Start/stop testing interval (in sec) */
static int irqreader = 1; /* RCU readers from irq (timers). */
@@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
#define TORTURE_FLAG "-torture:"
#define PRINTK_STRING(s) \
- do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
+ do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
#define VERBOSE_PRINTK_STRING(s) \
- do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
+ do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
#define VERBOSE_PRINTK_ERRSTRING(s) \
- do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
+ do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
static char printk_buf[4096];
@@ -176,8 +177,14 @@ static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
static long n_offline_attempts;
static long n_offline_successes;
+static unsigned long sum_offline;
+static int min_offline = -1;
+static int max_offline;
static long n_online_attempts;
static long n_online_successes;
+static unsigned long sum_online;
+static int min_online = -1;
+static int max_online;
static long n_barrier_attempts;
static long n_barrier_successes;
static struct list_head rcu_torture_removed;
@@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
if (fullstop == FULLSTOP_DONTSTOP)
fullstop = FULLSTOP_SHUTDOWN;
else
- printk(KERN_WARNING /* but going down anyway, so... */
+ pr_warn(/* but going down anyway, so... */
"Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
mutex_unlock(&fullstop_mutex);
return NOTIFY_DONE;
@@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1,
static void rcutorture_shutdown_absorb(char *title)
{
if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
- printk(KERN_NOTICE
+ pr_notice(
"rcutorture thread %s parking due to system shutdown\n",
title);
schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
@@ -1214,11 +1221,13 @@ rcu_torture_printk(char *page)
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
n_rcu_torture_timers);
- cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ",
- n_online_successes,
- n_online_attempts,
- n_offline_successes,
- n_offline_attempts);
+ cnt += sprintf(&page[cnt],
+ "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld",
n_barrier_successes,
n_barrier_attempts,
@@ -1267,7 +1276,7 @@ rcu_torture_stats_print(void)
int cnt;
cnt = rcu_torture_printk(printk_buf);
- printk(KERN_ALERT "%s", printk_buf);
+ pr_alert("%s", printk_buf);
}
/*
@@ -1380,20 +1389,20 @@ rcu_torture_stutter(void *arg)
static inline void
rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
{
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "--- %s: nreaders=%d nfakewriters=%d "
- "stat_interval=%d verbose=%d test_no_idle_hz=%d "
- "shuffle_interval=%d stutter=%d irqreader=%d "
- "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
- "test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d shutdown_secs=%d "
- "onoff_interval=%d onoff_holdoff=%d\n",
- torture_type, tag, nrealreaders, nfakewriters,
- stat_interval, verbose, test_no_idle_hz, shuffle_interval,
- stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
- test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration, shutdown_secs,
- onoff_interval, onoff_holdoff);
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nreaders=%d nfakewriters=%d "
+ "stat_interval=%d verbose=%d test_no_idle_hz=%d "
+ "shuffle_interval=%d stutter=%d irqreader=%d "
+ "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
+ "test_boost=%d/%d test_boost_interval=%d "
+ "test_boost_duration=%d shutdown_secs=%d "
+ "onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealreaders, nfakewriters,
+ stat_interval, verbose, test_no_idle_hz, shuffle_interval,
+ stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
+ test_boost, cur_ops->can_boost,
+ test_boost_interval, test_boost_duration, shutdown_secs,
+ onoff_interval, onoff_holdoff);
}
static struct notifier_block rcutorture_shutdown_nb = {
@@ -1460,9 +1469,9 @@ rcu_torture_shutdown(void *arg)
!kthread_should_stop()) {
delta = shutdown_time - jiffies_snap;
if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_shutdown task: %lu jiffies remaining\n",
- torture_type, delta);
+ pr_alert("%s" TORTURE_FLAG
+ "rcu_torture_shutdown task: %lu jiffies remaining\n",
+ torture_type, delta);
schedule_timeout_interruptible(delta);
jiffies_snap = ACCESS_ONCE(jiffies);
}
@@ -1490,8 +1499,10 @@ static int __cpuinit
rcu_torture_onoff(void *arg)
{
int cpu;
+ unsigned long delta;
int maxcpu = -1;
DEFINE_RCU_RANDOM(rand);
+ unsigned long starttime;
VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
for_each_online_cpu(cpu)
@@ -1506,29 +1517,51 @@ rcu_torture_onoff(void *arg)
cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlining %d\n",
- torture_type, cpu);
+ pr_alert("%s" TORTURE_FLAG
+ "rcu_torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
n_offline_attempts++;
if (cpu_down(cpu) == 0) {
if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlined %d\n",
- torture_type, cpu);
+ pr_alert("%s" TORTURE_FLAG
+ "rcu_torture_onoff task: offlined %d\n",
+ torture_type, cpu);
n_offline_successes++;
+ delta = jiffies - starttime;
+ sum_offline += delta;
+ if (min_offline < 0) {
+ min_offline = delta;
+ max_offline = delta;
+ }
+ if (min_offline > delta)
+ min_offline = delta;
+ if (max_offline < delta)
+ max_offline = delta;
}
} else if (cpu_is_hotpluggable(cpu)) {
if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlining %d\n",
- torture_type, cpu);
+ pr_alert("%s" TORTURE_FLAG
+ "rcu_torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
n_online_attempts++;
if (cpu_up(cpu) == 0) {
if (verbose)
- printk(KERN_ALERT "%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlined %d\n",
- torture_type, cpu);
+ pr_alert("%s" TORTURE_FLAG
+ "rcu_torture_onoff task: onlined %d\n",
+ torture_type, cpu);
n_online_successes++;
+ delta = jiffies - starttime;
+ sum_online += delta;
+ if (min_online < 0) {
+ min_online = delta;
+ max_online = delta;
+ }
+ if (min_online > delta)
+ min_online = delta;
+ if (max_online < delta)
+ max_online = delta;
}
}
schedule_timeout_interruptible(onoff_interval * HZ);
@@ -1593,14 +1626,14 @@ static int __cpuinit rcu_torture_stall(void *args)
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
- printk(KERN_ALERT "rcu_torture_stall start.\n");
+ pr_alert("rcu_torture_stall start.\n");
rcu_read_lock();
preempt_disable();
while (ULONG_CMP_LT(get_seconds(), stop_at))
continue; /* Induce RCU CPU stall warning. */
preempt_enable();
rcu_read_unlock();
- printk(KERN_ALERT "rcu_torture_stall end.\n");
+ pr_alert("rcu_torture_stall end.\n");
}
rcutorture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
@@ -1716,12 +1749,12 @@ static int rcu_torture_barrier_init(void)
if (n_barrier_cbs == 0)
return 0;
if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
- printk(KERN_ALERT "%s" TORTURE_FLAG
- " Call or barrier ops missing for %s,\n",
- torture_type, cur_ops->name);
- printk(KERN_ALERT "%s" TORTURE_FLAG
- " RCU barrier testing omitted from run.\n",
- torture_type);
+ pr_alert("%s" TORTURE_FLAG
+ " Call or barrier ops missing for %s,\n",
+ torture_type, cur_ops->name);
+ pr_alert("%s" TORTURE_FLAG
+ " RCU barrier testing omitted from run.\n",
+ torture_type);
return 0;
}
atomic_set(&barrier_cbs_count, 0);
@@ -1814,7 +1847,7 @@ rcu_torture_cleanup(void)
mutex_lock(&fullstop_mutex);
rcutorture_record_test_transition();
if (fullstop == FULLSTOP_SHUTDOWN) {
- printk(KERN_WARNING /* but going down anyway, so... */
+ pr_warn(/* but going down anyway, so... */
"Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
mutex_unlock(&fullstop_mutex);
schedule_timeout_uninterruptible(10);
@@ -1938,17 +1971,17 @@ rcu_torture_init(void)
break;
}
if (i == ARRAY_SIZE(torture_ops)) {
- printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n",
- torture_type);
- printk(KERN_ALERT "rcu-torture types:");
+ pr_alert("rcu-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("rcu-torture types:");
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
- printk(KERN_ALERT " %s", torture_ops[i]->name);
- printk(KERN_ALERT "\n");
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
mutex_unlock(&fullstop_mutex);
return -EINVAL;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
- printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
+ pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n");
fqs_duration = 0;
}
if (cur_ops->init)
@@ -1996,14 +2029,15 @@ rcu_torture_init(void)
/* Start up the kthreads. */
VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
- writer_task = kthread_run(rcu_torture_writer, NULL,
- "rcu_torture_writer");
+ writer_task = kthread_create(rcu_torture_writer, NULL,
+ "rcu_torture_writer");
if (IS_ERR(writer_task)) {
firsterr = PTR_ERR(writer_task);
VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
writer_task = NULL;
goto unwind;
}
+ wake_up_process(writer_task);
fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
@@ -2118,14 +2152,15 @@ rcu_torture_init(void)
}
if (shutdown_secs > 0) {
shutdown_time = jiffies + shutdown_secs * HZ;
- shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
- "rcu_torture_shutdown");
+ shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
+ "rcu_torture_shutdown");
if (IS_ERR(shutdown_task)) {
firsterr = PTR_ERR(shutdown_task);
VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
shutdown_task = NULL;
goto unwind;
}
+ wake_up_process(shutdown_task);
}
i = rcu_torture_onoff_init();
if (i != 0) {
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f280e542e3e..4fb2376ddf0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,6 +52,7 @@
#include <linux/prefetch.h>
#include <linux/delay.h>
#include <linux/stop_machine.h>
+#include <linux/random.h>
#include "rcutree.h"
#include <trace/events/rcu.h>
@@ -61,6 +62,7 @@
/* Data structures. */
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
+static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
#define RCU_STATE_INITIALIZER(sname, cr) { \
.level = { &sname##_state.node[0] }, \
@@ -72,7 +74,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
.orphan_nxttail = &sname##_state.orphan_nxtlist, \
.orphan_donetail = &sname##_state.orphan_donelist, \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
- .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
.name = #sname, \
}
@@ -88,7 +89,7 @@ LIST_HEAD(rcu_struct_flavors);
/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
-module_param(rcu_fanout_leaf, int, 0);
+module_param(rcu_fanout_leaf, int, 0444);
int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */
NUM_RCU_LVL_0,
@@ -133,13 +134,12 @@ static int rcu_scheduler_fully_active __read_mostly;
*/
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -175,8 +175,6 @@ void rcu_sched_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
- rdp->passed_quiesce_gpnum = rdp->gpnum;
- barrier();
if (rdp->passed_quiesce == 0)
trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
rdp->passed_quiesce = 1;
@@ -186,8 +184,6 @@ void rcu_bh_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
- rdp->passed_quiesce_gpnum = rdp->gpnum;
- barrier();
if (rdp->passed_quiesce == 0)
trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
rdp->passed_quiesce = 1;
@@ -210,15 +206,18 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
.dynticks = ATOMIC_INIT(1),
+#if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE)
+ .ignore_user_qs = true,
+#endif
};
static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static int qhimark = 10000; /* If this many pending, ignore blimit. */
static int qlowmark = 100; /* Once only this many pending, use blimit. */
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
+module_param(blimit, int, 0444);
+module_param(qhimark, int, 0444);
+module_param(qlowmark, int, 0444);
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
@@ -226,7 +225,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
module_param(rcu_cpu_stall_timeout, int, 0644);
-static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS;
+static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
+
+module_param(jiffies_till_first_fqs, ulong, 0644);
+module_param(jiffies_till_next_fqs, ulong, 0644);
+
+static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
+static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
/*
@@ -252,7 +258,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
*/
void rcu_bh_force_quiescent_state(void)
{
- force_quiescent_state(&rcu_bh_state, 0);
+ force_quiescent_state(&rcu_bh_state);
}
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
@@ -286,7 +292,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress);
*/
void rcu_sched_force_quiescent_state(void)
{
- force_quiescent_state(&rcu_sched_state, 0);
+ force_quiescent_state(&rcu_sched_state);
}
EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
@@ -305,7 +311,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
static int
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
+ return *rdp->nxttail[RCU_DONE_TAIL +
+ ACCESS_ONCE(rsp->completed) != rdp->completed] &&
+ !rcu_gp_in_progress(rsp);
}
/*
@@ -317,45 +325,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
}
/*
- * If the specified CPU is offline, tell the caller that it is in
- * a quiescent state. Otherwise, whack it with a reschedule IPI.
- * Grace periods can end up waiting on an offline CPU when that
- * CPU is in the process of coming online -- it will be added to the
- * rcu_node bitmasks before it actually makes it online. The same thing
- * can happen while a CPU is in the process of coming online. Because this
- * race is quite rare, we check for it after detecting that the grace
- * period has been delayed rather than checking each and every CPU
- * each and every time we start a new grace period.
- */
-static int rcu_implicit_offline_qs(struct rcu_data *rdp)
-{
- /*
- * If the CPU is offline for more than a jiffy, it is in a quiescent
- * state. We can trust its state not to change because interrupts
- * are disabled. The reason for the jiffy's worth of slack is to
- * handle CPUs initializing on the way up and finding their way
- * to the idle loop on the way down.
- */
- if (cpu_is_offline(rdp->cpu) &&
- ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) {
- trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
- rdp->offline_fqs++;
- return 1;
- }
- return 0;
-}
-
-/*
- * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
+ * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state
*
* If the new value of the ->dynticks_nesting counter now is zero,
* we really have entered idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
+static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
+ bool user)
{
trace_rcu_dyntick("Start", oldval, 0);
- if (!is_idle_task(current)) {
+ if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
@@ -372,7 +352,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
/*
- * The idle task is not permitted to enter the idle loop while
+ * It is illegal to enter an extended quiescent state while
* in an RCU read-side critical section.
*/
rcu_lockdep_assert(!lock_is_held(&rcu_lock_map),
@@ -383,6 +363,25 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
"Illegal idle entry in RCU-sched read-side critical section.");
}
+/*
+ * Enter an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ */
+static void rcu_eqs_enter(bool user)
+{
+ long long oldval;
+ struct rcu_dynticks *rdtp;
+
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
+ if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
+ rdtp->dynticks_nesting = 0;
+ else
+ rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
+ rcu_eqs_enter_common(rdtp, oldval, user);
+}
+
/**
* rcu_idle_enter - inform RCU that current CPU is entering idle
*
@@ -398,21 +397,70 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
void rcu_idle_enter(void)
{
unsigned long flags;
- long long oldval;
+
+ local_irq_save(flags);
+ rcu_eqs_enter(false);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
+
+#ifdef CONFIG_RCU_USER_QS
+/**
+ * rcu_user_enter - inform RCU that we are resuming userspace.
+ *
+ * Enter RCU idle mode right before resuming userspace. No use of RCU
+ * is permitted between this call and rcu_user_exit(). This way the
+ * CPU doesn't need to maintain the tick for RCU maintenance purposes
+ * when the CPU runs in userspace.
+ */
+void rcu_user_enter(void)
+{
+ unsigned long flags;
struct rcu_dynticks *rdtp;
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ WARN_ON_ONCE(!current->mm);
+
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
- if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
- rdtp->dynticks_nesting = 0;
- else
- rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
- rcu_idle_enter_common(rdtp, oldval);
+ if (!rdtp->ignore_user_qs && !rdtp->in_user) {
+ rdtp->in_user = true;
+ rcu_eqs_enter(true);
+ }
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
+
+/**
+ * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace
+ * after the current irq returns.
+ *
+ * This is similar to rcu_user_enter() but in the context of a non-nesting
+ * irq. After this call, RCU enters into idle mode when the interrupt
+ * returns.
+ */
+void rcu_user_enter_after_irq(void)
+{
+ unsigned long flags;
+ struct rcu_dynticks *rdtp;
+
+ local_irq_save(flags);
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ /* Ensure this irq is interrupting a non-idle RCU state. */
+ WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK));
+ rdtp->dynticks_nesting = 1;
+ local_irq_restore(flags);
+}
+#endif /* CONFIG_RCU_USER_QS */
/**
* rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -444,18 +492,19 @@ void rcu_irq_exit(void)
if (rdtp->dynticks_nesting)
trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
else
- rcu_idle_enter_common(rdtp, oldval);
+ rcu_eqs_enter_common(rdtp, oldval, true);
local_irq_restore(flags);
}
/*
- * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
+ * rcu_eqs_exit_common - current CPU moving away from extended quiescent state
*
* If the new value of the ->dynticks_nesting counter was previously zero,
* we really have exited idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
+static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
+ int user)
{
smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
@@ -464,7 +513,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
rcu_cleanup_after_idle(smp_processor_id());
trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
- if (!is_idle_task(current)) {
+ if (!user && !is_idle_task(current)) {
struct task_struct *idle = idle_task(smp_processor_id());
trace_rcu_dyntick("Error on exit: not idle task",
@@ -476,6 +525,25 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
}
}
+/*
+ * Exit an RCU extended quiescent state, which can be either the
+ * idle loop or adaptive-tickless usermode execution.
+ */
+static void rcu_eqs_exit(bool user)
+{
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE(oldval < 0);
+ if (oldval & DYNTICK_TASK_NEST_MASK)
+ rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
+ else
+ rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
+ rcu_eqs_exit_common(rdtp, oldval, user);
+}
+
/**
* rcu_idle_exit - inform RCU that current CPU is leaving idle
*
@@ -490,21 +558,67 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
void rcu_idle_exit(void)
{
unsigned long flags;
+
+ local_irq_save(flags);
+ rcu_eqs_exit(false);
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
+#ifdef CONFIG_RCU_USER_QS
+/**
+ * rcu_user_exit - inform RCU that we are exiting userspace.
+ *
+ * Exit RCU idle mode while entering the kernel because it can
+ * run a RCU read side critical section anytime.
+ */
+void rcu_user_exit(void)
+{
+ unsigned long flags;
struct rcu_dynticks *rdtp;
- long long oldval;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- oldval = rdtp->dynticks_nesting;
- WARN_ON_ONCE(oldval < 0);
- if (oldval & DYNTICK_TASK_NEST_MASK)
- rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
- else
- rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_idle_exit_common(rdtp, oldval);
+ if (rdtp->in_user) {
+ rdtp->in_user = false;
+ rcu_eqs_exit(true);
+ }
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
+
+/**
+ * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace
+ * idle mode after the current non-nesting irq returns.
+ *
+ * This is similar to rcu_user_exit() but in the context of an irq.
+ * This is called when the irq has interrupted a userspace RCU idle mode
+ * context. When the current non-nesting interrupt returns after this call,
+ * the CPU won't restore the RCU idle mode.
+ */
+void rcu_user_exit_after_irq(void)
+{
+ unsigned long flags;
+ struct rcu_dynticks *rdtp;
+
+ local_irq_save(flags);
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ /* Ensure we are interrupting an RCU idle mode. */
+ WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK);
+ rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE;
+ local_irq_restore(flags);
+}
+#endif /* CONFIG_RCU_USER_QS */
/**
* rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -539,7 +653,7 @@ void rcu_irq_enter(void)
if (oldval)
trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
else
- rcu_idle_exit_common(rdtp, oldval);
+ rcu_eqs_exit_common(rdtp, oldval, true);
local_irq_restore(flags);
}
@@ -603,6 +717,21 @@ int rcu_is_cpu_idle(void)
}
EXPORT_SYMBOL(rcu_is_cpu_idle);
+#ifdef CONFIG_RCU_USER_QS
+void rcu_user_hooks_switch(struct task_struct *prev,
+ struct task_struct *next)
+{
+ struct rcu_dynticks *rdtp;
+
+ /* Interrupts are disabled in context switch */
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ if (!rdtp->ignore_user_qs) {
+ clear_tsk_thread_flag(prev, TIF_NOHZ);
+ set_tsk_thread_flag(next, TIF_NOHZ);
+ }
+}
+#endif /* #ifdef CONFIG_RCU_USER_QS */
+
#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
/*
@@ -673,7 +802,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU.
+ * for this same CPU, or by virtue of having been offline.
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
@@ -697,8 +826,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
return 1;
}
- /* Go check for the CPU being offline. */
- return rcu_implicit_offline_qs(rdp);
+ /*
+ * Check for the CPU being offline, but only if the grace period
+ * is old enough. We don't need to worry about the CPU changing
+ * state: If we see it offline even once, it has been through a
+ * quiescent state.
+ *
+ * The reason for insisting that the grace period be at least
+ * one jiffy old is that CPUs that are not quite online and that
+ * have just gone offline can still execute RCU read-side critical
+ * sections.
+ */
+ if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies))
+ return 0; /* Grace period is not old enough. */
+ barrier();
+ if (cpu_is_offline(rdp->cpu)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
+ rdp->offline_fqs++;
+ return 1;
+ }
+ return 0;
}
static int jiffies_till_stall_check(void)
@@ -755,14 +902,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave(&rnp->lock, flags);
ndetected += rcu_print_task_stall(rnp);
+ if (rnp->qsmask != 0) {
+ for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+ if (rnp->qsmask & (1UL << cpu)) {
+ print_cpu_stall_info(rsp,
+ rnp->grplo + cpu);
+ ndetected++;
+ }
+ }
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- if (rnp->qsmask == 0)
- continue;
- for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu)) {
- print_cpu_stall_info(rsp, rnp->grplo + cpu);
- ndetected++;
- }
}
/*
@@ -782,11 +930,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
else if (!trigger_all_cpu_backtrace())
dump_stack();
- /* If so configured, complain about tasks blocking the grace period. */
+ /* Complain about tasks blocking the grace period. */
rcu_print_detail_task_stall(rsp);
- force_quiescent_state(rsp, 0); /* Kick them all. */
+ force_quiescent_state(rsp); /* Kick them all. */
}
static void print_cpu_stall(struct rcu_state *rsp)
@@ -827,7 +975,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
j = ACCESS_ONCE(jiffies);
js = ACCESS_ONCE(rsp->jiffies_stall);
rnp = rdp->mynode;
- if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
+ if (rcu_gp_in_progress(rsp) &&
+ (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
@@ -889,12 +1038,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
*/
rdp->gpnum = rnp->gpnum;
trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
- if (rnp->qsmask & rdp->grpmask) {
- rdp->qs_pending = 1;
- rdp->passed_quiesce = 0;
- } else {
- rdp->qs_pending = 0;
- }
+ rdp->passed_quiesce = 0;
+ rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask);
zero_cpu_stall_ticks(rdp);
}
}
@@ -974,10 +1119,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
* our behalf. Catch up with this state to avoid noting
* spurious new grace periods. If another grace period
* has started, then rnp->gpnum will have advanced, so
- * we will detect this later on.
+ * we will detect this later on. Of course, any quiescent
+ * states we found for the old GP are now invalid.
*/
- if (ULONG_CMP_LT(rdp->gpnum, rdp->completed))
+ if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) {
rdp->gpnum = rdp->completed;
+ rdp->passed_quiesce = 0;
+ }
/*
* If RCU does not need a quiescent state from this CPU,
@@ -1021,97 +1169,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
/* Prior grace period ended, so advance callbacks for current CPU. */
__rcu_process_gp_end(rsp, rnp, rdp);
- /*
- * Because this CPU just now started the new grace period, we know
- * that all of its callbacks will be covered by this upcoming grace
- * period, even the ones that were registered arbitrarily recently.
- * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
- *
- * Other CPUs cannot be sure exactly when the grace period started.
- * Therefore, their recently registered callbacks must pass through
- * an additional RCU_NEXT_READY stage, so that they will be handled
- * by the next RCU grace period.
- */
- rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
- rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-
/* Set state so that this CPU will detect the next quiescent state. */
__note_new_gpnum(rsp, rnp, rdp);
}
/*
- * Start a new RCU grace period if warranted, re-initializing the hierarchy
- * in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock, which is released before return. Hard irqs must
- * be disabled.
- *
- * Note that it is legal for a dying CPU (which is marked as offline) to
- * invoke this function. This can happen when the dying CPU reports its
- * quiescent state.
+ * Initialize a new grace period.
*/
-static void
-rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
+static int rcu_gp_init(struct rcu_state *rsp)
{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
- if (!rcu_scheduler_fully_active ||
- !cpu_needs_another_gp(rsp, rdp)) {
- /*
- * Either the scheduler hasn't yet spawned the first
- * non-idle task or this CPU does not need another
- * grace period. Either way, don't start a new grace
- * period.
- */
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
+ raw_spin_lock_irq(&rnp->lock);
+ rsp->gp_flags = 0; /* Clear all flags: New grace period. */
- if (rsp->fqs_active) {
- /*
- * This CPU needs a grace period, but force_quiescent_state()
- * is running. Tell it to start one on this CPU's behalf.
- */
- rsp->fqs_need_gp = 1;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
+ if (rcu_gp_in_progress(rsp)) {
+ /* Grace period already in progress, don't start another. */
+ raw_spin_unlock_irq(&rnp->lock);
+ return 0;
}
/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
- WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
- rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
- rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
record_gp_stall_check_time(rsp);
- raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
+ raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
- raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
+ get_online_cpus();
/*
* Set the quiescent-state-needed bits in all the rcu_node
- * structures for all currently online CPUs in breadth-first
- * order, starting from the root rcu_node structure. This
- * operation relies on the layout of the hierarchy within the
- * rsp->node[] array. Note that other CPUs will access only
- * the leaves of the hierarchy, which still indicate that no
+ * structures for all currently online CPUs in breadth-first order,
+ * starting from the root rcu_node structure, relying on the layout
+ * of the tree within the rsp->node[] array. Note that other CPUs
+ * will access only the leaves of the hierarchy, thus seeing that no
* grace period is in progress, at least until the corresponding
* leaf node has been initialized. In addition, we have excluded
* CPU-hotplug operations.
*
- * Note that the grace period cannot complete until we finish
- * the initialization process, as there will be at least one
- * qsmask bit set in the root node until that time, namely the
- * one corresponding to this CPU, due to the fact that we have
- * irqs disabled.
+ * The grace period cannot complete until the initialization
+ * process finishes, because this kthread handles both.
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ raw_spin_lock_irq(&rnp->lock);
+ rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
+ WARN_ON_ONCE(rnp->completed != rsp->completed);
rnp->completed = rsp->completed;
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
@@ -1119,37 +1226,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_irq(&rnp->lock);
+#ifdef CONFIG_PROVE_RCU_DELAY
+ if ((random32() % (rcu_num_nodes * 8)) == 0)
+ schedule_timeout_uninterruptible(2);
+#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
+ cond_resched();
}
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ put_online_cpus();
+ return 1;
}
/*
- * Report a full set of quiescent states to the specified rcu_state
- * data structure. This involves cleaning up after the prior grace
- * period and letting rcu_start_gp() start up the next grace period
- * if one is needed. Note that the caller must hold rnp->lock, as
- * required by rcu_start_gp(), which will release it.
+ * Do one round of quiescent-state forcing.
*/
-static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
+int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
{
- unsigned long gp_duration;
+ int fqs_state = fqs_state_in;
struct rcu_node *rnp = rcu_get_root(rsp);
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ rsp->n_force_qs++;
+ if (fqs_state == RCU_SAVE_DYNTICK) {
+ /* Collect dyntick-idle snapshots. */
+ force_qs_rnp(rsp, dyntick_save_progress_counter);
+ fqs_state = RCU_FORCE_QS;
+ } else {
+ /* Handle dyntick-idle and offline CPUs. */
+ force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
+ }
+ /* Clear flag to prevent immediate re-entry. */
+ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ raw_spin_lock_irq(&rnp->lock);
+ rsp->gp_flags &= ~RCU_GP_FLAG_FQS;
+ raw_spin_unlock_irq(&rnp->lock);
+ }
+ return fqs_state;
+}
- /*
- * Ensure that all grace-period and pre-grace-period activity
- * is seen before the assignment to rsp->completed.
- */
- smp_mb(); /* See above block comment. */
+/*
+ * Clean up after the old grace period.
+ */
+static void rcu_gp_cleanup(struct rcu_state *rsp)
+{
+ unsigned long gp_duration;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ raw_spin_lock_irq(&rnp->lock);
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
@@ -1161,35 +1285,149 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
* they can do to advance the grace period. It is therefore
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
- *
- * But if this CPU needs another grace period, it will take
- * care of this while initializing the next grace period.
- * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
- * because the callbacks have not yet been advanced: Those
- * callbacks are waiting on the grace period that just now
- * completed.
*/
- if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_irq(&rnp->lock);
- /*
- * Propagate new ->completed value to rcu_node structures
- * so that other CPUs don't have to wait until the start
- * of the next grace period to process their callbacks.
- */
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->completed = rsp->gpnum;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
- rnp = rcu_get_root(rsp);
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ /*
+ * Propagate new ->completed value to rcu_node structures so
+ * that other CPUs don't have to wait until the start of the next
+ * grace period to process their callbacks. This also avoids
+ * some nasty RCU grace-period initialization races by forcing
+ * the end of the current grace period to be completely recorded in
+ * all of the rcu_node structures before the beginning of the next
+ * grace period is recorded in any of the rcu_node structures.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock_irq(&rnp->lock);
+ rnp->completed = rsp->gpnum;
+ raw_spin_unlock_irq(&rnp->lock);
+ cond_resched();
}
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irq(&rnp->lock);
- rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
+ rsp->completed = rsp->gpnum; /* Declare grace period done. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
- rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
+ rdp = this_cpu_ptr(rsp->rda);
+ if (cpu_needs_another_gp(rsp, rdp))
+ rsp->gp_flags = 1;
+ raw_spin_unlock_irq(&rnp->lock);
+}
+
+/*
+ * Body of kthread that handles grace periods.
+ */
+static int __noreturn rcu_gp_kthread(void *arg)
+{
+ int fqs_state;
+ unsigned long j;
+ int ret;
+ struct rcu_state *rsp = arg;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ for (;;) {
+
+ /* Handle grace-period start. */
+ for (;;) {
+ wait_event_interruptible(rsp->gp_wq,
+ rsp->gp_flags &
+ RCU_GP_FLAG_INIT);
+ if ((rsp->gp_flags & RCU_GP_FLAG_INIT) &&
+ rcu_gp_init(rsp))
+ break;
+ cond_resched();
+ flush_signals(current);
+ }
+
+ /* Handle quiescent-state forcing. */
+ fqs_state = RCU_SAVE_DYNTICK;
+ j = jiffies_till_first_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_first_fqs = HZ;
+ }
+ for (;;) {
+ rsp->jiffies_force_qs = jiffies + j;
+ ret = wait_event_interruptible_timeout(rsp->gp_wq,
+ (rsp->gp_flags & RCU_GP_FLAG_FQS) ||
+ (!ACCESS_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp)),
+ j);
+ /* If grace period done, leave loop. */
+ if (!ACCESS_ONCE(rnp->qsmask) &&
+ !rcu_preempt_blocked_readers_cgp(rnp))
+ break;
+ /* If time for quiescent-state forcing, do it. */
+ if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) {
+ fqs_state = rcu_gp_fqs(rsp, fqs_state);
+ cond_resched();
+ } else {
+ /* Deal with stray signal. */
+ cond_resched();
+ flush_signals(current);
+ }
+ j = jiffies_till_next_fqs;
+ if (j > HZ) {
+ j = HZ;
+ jiffies_till_next_fqs = HZ;
+ } else if (j < 1) {
+ j = 1;
+ jiffies_till_next_fqs = 1;
+ }
+ }
+
+ /* Handle grace-period end. */
+ rcu_gp_cleanup(rsp);
+ }
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period. The caller must hold
+ * the root node's ->lock, which is released before return. Hard irqs must
+ * be disabled.
+ *
+ * Note that it is legal for a dying CPU (which is marked as offline) to
+ * invoke this function. This can happen when the dying CPU reports its
+ * quiescent state.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
+ __releases(rcu_get_root(rsp)->lock)
+{
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ if (!rsp->gp_kthread ||
+ !cpu_needs_another_gp(rsp, rdp)) {
+ /*
+ * Either we have not yet spawned the grace-period
+ * task or this CPU does not need another grace period.
+ * Either way, don't start a new grace period.
+ */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+
+ rsp->gp_flags = RCU_GP_FLAG_INIT;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ wake_up(&rsp->gp_wq);
+}
+
+/*
+ * Report a full set of quiescent states to the specified rcu_state
+ * data structure. This involves cleaning up after the prior grace
+ * period and letting rcu_start_gp() start up the next grace period
+ * if one is needed. Note that the caller must hold rnp->lock, as
+ * required by rcu_start_gp(), which will release it.
+ */
+static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
+ __releases(rcu_get_root(rsp)->lock)
+{
+ WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
+ wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
}
/*
@@ -1258,7 +1496,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
* based on quiescent states detected in an earlier grace period!
*/
static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
unsigned long mask;
@@ -1266,7 +1504,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
+ if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum ||
+ rnp->completed == rnp->gpnum) {
/*
* The grace period in which this quiescent state was
@@ -1325,7 +1564,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Tell RCU we are done (but rcu_report_qs_rdp() will be the
* judge of that).
*/
- rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
+ rcu_report_qs_rdp(rdp->cpu, rsp, rdp);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1390,17 +1629,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
int i;
struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
- /*
- * If there is an rcu_barrier() operation in progress, then
- * only the task doing that operation is permitted to adopt
- * callbacks. To do otherwise breaks rcu_barrier() and friends
- * by causing them to fail to wait for the callbacks in the
- * orphanage.
- */
- if (rsp->rcu_barrier_in_progress &&
- rsp->rcu_barrier_in_progress != current)
- return;
-
/* Do the accounting first. */
rdp->qlen_lazy += rsp->qlen_lazy;
rdp->qlen += rsp->qlen;
@@ -1455,9 +1683,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
* including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them, if there is no _rcu_barrier() instance running.
- * There can only be one CPU hotplug operation at a time, so no other
- * CPU can be attempting to update rcu_cpu_kthread_task.
+ * adopting them. There can only be one CPU hotplug operation at a time,
+ * so no other CPU can be attempting to update rcu_cpu_kthread_task.
*/
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
@@ -1468,8 +1695,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
/* Adjust any no-longer-needed kthreads. */
- rcu_stop_cpu_kthread(cpu);
- rcu_node_kthread_setaffinity(rnp, -1);
+ rcu_boost_kthread_setaffinity(rnp, -1);
/* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
@@ -1515,14 +1741,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
"rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
cpu, rdp->qlen, rdp->nxtlist);
+ init_callback_list(rdp);
+ /* Disallow further callbacks on this CPU. */
+ rdp->nxttail[RCU_NEXT_TAIL] = NULL;
}
#else /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
-{
-}
-
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
{
}
@@ -1687,6 +1912,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
struct rcu_node *rnp;
rcu_for_each_leaf_node(rsp, rnp) {
+ cond_resched();
mask = 0;
raw_spin_lock_irqsave(&rnp->lock, flags);
if (!rcu_gp_in_progress(rsp)) {
@@ -1723,72 +1949,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
* Force quiescent states on reluctant CPUs, and also detect which
* CPUs are in dyntick-idle mode.
*/
-static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+static void force_quiescent_state(struct rcu_state *rsp)
{
unsigned long flags;
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- trace_rcu_utilization("Start fqs");
- if (!rcu_gp_in_progress(rsp)) {
- trace_rcu_utilization("End fqs");
- return; /* No grace period in progress, nothing to force. */
- }
- if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
- rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
- trace_rcu_utilization("End fqs");
- return; /* Someone else is already on the job. */
- }
- if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
- goto unlock_fqs_ret; /* no emergency and done recently. */
- rsp->n_force_qs++;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
- if(!rcu_gp_in_progress(rsp)) {
- rsp->n_force_qs_ngp++;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- goto unlock_fqs_ret; /* no GP in progress, time updated. */
- }
- rsp->fqs_active = 1;
- switch (rsp->fqs_state) {
- case RCU_GP_IDLE:
- case RCU_GP_INIT:
-
- break; /* grace period idle or initializing, ignore. */
-
- case RCU_SAVE_DYNTICK:
-
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
-
- /* Record dyntick-idle state. */
- force_qs_rnp(rsp, dyntick_save_progress_counter);
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- if (rcu_gp_in_progress(rsp))
- rsp->fqs_state = RCU_FORCE_QS;
- break;
-
- case RCU_FORCE_QS:
-
- /* Check dyntick-idle state, send IPI to laggarts. */
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
- force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
-
- /* Leave state in case more forcing is required. */
-
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- break;
+ bool ret;
+ struct rcu_node *rnp;
+ struct rcu_node *rnp_old = NULL;
+
+ /* Funnel through hierarchy to reduce memory contention. */
+ rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ for (; rnp != NULL; rnp = rnp->parent) {
+ ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
+ !raw_spin_trylock(&rnp->fqslock);
+ if (rnp_old != NULL)
+ raw_spin_unlock(&rnp_old->fqslock);
+ if (ret) {
+ rsp->n_force_qs_lh++;
+ return;
+ }
+ rnp_old = rnp;
}
- rsp->fqs_active = 0;
- if (rsp->fqs_need_gp) {
- raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
- rsp->fqs_need_gp = 0;
- rcu_start_gp(rsp, flags); /* releases rnp->lock */
- trace_rcu_utilization("End fqs");
- return;
+ /* rnp_old == rcu_get_root(rsp), rnp == NULL. */
+
+ /* Reached the root of the rcu_node tree, acquire lock. */
+ raw_spin_lock_irqsave(&rnp_old->lock, flags);
+ raw_spin_unlock(&rnp_old->fqslock);
+ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
+ rsp->n_force_qs_lh++;
+ raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ return; /* Someone beat us to it. */
}
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
-unlock_fqs_ret:
- raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
- trace_rcu_utilization("End fqs");
+ rsp->gp_flags |= RCU_GP_FLAG_FQS;
+ raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
+ wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */
}
/*
@@ -1805,13 +1998,6 @@ __rcu_process_callbacks(struct rcu_state *rsp)
WARN_ON_ONCE(rdp->beenonline == 0);
/*
- * If an RCU GP has gone long enough, go check for dyntick
- * idle CPUs and, if needed, send resched IPIs.
- */
- if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
- force_quiescent_state(rsp, 1);
-
- /*
* Advance callbacks in response to end of earlier grace
* period that some other CPU ended.
*/
@@ -1838,6 +2024,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
{
struct rcu_state *rsp;
+ if (cpu_is_offline(smp_processor_id()))
+ return;
trace_rcu_utilization("Start RCU core");
for_each_rcu_flavor(rsp)
__rcu_process_callbacks(rsp);
@@ -1909,12 +2097,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
rdp->blimit = LONG_MAX;
if (rsp->n_force_qs == rdp->n_force_qs_snap &&
*rdp->nxttail[RCU_DONE_TAIL] != head)
- force_quiescent_state(rsp, 0);
+ force_quiescent_state(rsp);
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->qlen_last_fqs_check = rdp->qlen;
}
- } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
- force_quiescent_state(rsp, 1);
+ }
}
static void
@@ -1929,8 +2116,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
head->func = func;
head->next = NULL;
- smp_mb(); /* Ensure RCU update seen before callback registry. */
-
/*
* Opportunistically note grace-period endings and beginnings.
* Note that we might see a beginning right after we see an
@@ -1941,6 +2126,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
+ if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) {
+ /* _call_rcu() is illegal on offline CPU; leak the callback. */
+ WARN_ON_ONCE(1);
+ local_irq_restore(flags);
+ return;
+ }
ACCESS_ONCE(rdp->qlen)++;
if (lazy)
rdp->qlen_lazy++;
@@ -2195,17 +2386,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
/* Is the RCU core waiting for a quiescent state from this CPU? */
if (rcu_scheduler_fully_active &&
rdp->qs_pending && !rdp->passed_quiesce) {
-
- /*
- * If force_quiescent_state() coming soon and this CPU
- * needs a quiescent state, and this is either RCU-sched
- * or RCU-bh, force a local reschedule.
- */
rdp->n_rp_qs_pending++;
- if (!rdp->preemptible &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
- jiffies))
- set_need_resched();
} else if (rdp->qs_pending && rdp->passed_quiesce) {
rdp->n_rp_report_qs++;
return 1;
@@ -2235,13 +2416,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
return 1;
}
- /* Has an RCU GP gone long enough to send resched IPIs &c? */
- if (rcu_gp_in_progress(rsp) &&
- ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
- rdp->n_rp_need_fqs++;
- return 1;
- }
-
/* nothing to do */
rdp->n_rp_need_nothing++;
return 0;
@@ -2326,13 +2500,10 @@ static void rcu_barrier_func(void *type)
static void _rcu_barrier(struct rcu_state *rsp)
{
int cpu;
- unsigned long flags;
struct rcu_data *rdp;
- struct rcu_data rd;
unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
unsigned long snap_done;
- init_rcu_head_on_stack(&rd.barrier_head);
_rcu_barrier_trace(rsp, "Begin", -1, snap);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
@@ -2372,70 +2543,30 @@ static void _rcu_barrier(struct rcu_state *rsp)
/*
* Initialize the count to one rather than to zero in order to
* avoid a too-soon return to zero in case of a short grace period
- * (or preemption of this task). Also flag this task as doing
- * an rcu_barrier(). This will prevent anyone else from adopting
- * orphaned callbacks, which could cause otherwise failure if a
- * CPU went offline and quickly came back online. To see this,
- * consider the following sequence of events:
- *
- * 1. We cause CPU 0 to post an rcu_barrier_callback() callback.
- * 2. CPU 1 goes offline, orphaning its callbacks.
- * 3. CPU 0 adopts CPU 1's orphaned callbacks.
- * 4. CPU 1 comes back online.
- * 5. We cause CPU 1 to post an rcu_barrier_callback() callback.
- * 6. Both rcu_barrier_callback() callbacks are invoked, awakening
- * us -- but before CPU 1's orphaned callbacks are invoked!!!
+ * (or preemption of this task). Exclude CPU-hotplug operations
+ * to ensure that no offline CPU has callbacks queued.
*/
init_completion(&rsp->barrier_completion);
atomic_set(&rsp->barrier_cpu_count, 1);
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rsp->rcu_barrier_in_progress = current;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ get_online_cpus();
/*
- * Force every CPU with callbacks to register a new callback
- * that will tell us when all the preceding callbacks have
- * been invoked. If an offline CPU has callbacks, wait for
- * it to either come back online or to finish orphaning those
- * callbacks.
+ * Force each CPU with callbacks to register a new callback.
+ * When that callback is invoked, we will know that all of the
+ * corresponding CPU's preceding callbacks have been invoked.
*/
- for_each_possible_cpu(cpu) {
- preempt_disable();
+ for_each_online_cpu(cpu) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- if (cpu_is_offline(cpu)) {
- _rcu_barrier_trace(rsp, "Offline", cpu,
- rsp->n_barrier_done);
- preempt_enable();
- while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
- schedule_timeout_interruptible(1);
- } else if (ACCESS_ONCE(rdp->qlen)) {
+ if (ACCESS_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->n_barrier_done);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
- preempt_enable();
} else {
_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
rsp->n_barrier_done);
- preempt_enable();
}
}
-
- /*
- * Now that all online CPUs have rcu_barrier_callback() callbacks
- * posted, we can adopt all of the orphaned callbacks and place
- * an rcu_barrier_callback() callback after them. When that is done,
- * we are guaranteed to have an rcu_barrier_callback() callback
- * following every callback that could possibly have been
- * registered before _rcu_barrier() was called.
- */
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
- rcu_adopt_orphan_cbs(rsp);
- rsp->rcu_barrier_in_progress = NULL;
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- atomic_inc(&rsp->barrier_cpu_count);
- smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
- rd.rsp = rsp;
- rsp->call(&rd.barrier_head, rcu_barrier_callback);
+ put_online_cpus();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -2456,8 +2587,6 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Other rcu_barrier() invocations can now safely proceed. */
mutex_unlock(&rsp->barrier_mutex);
-
- destroy_rcu_head_on_stack(&rd.barrier_head);
}
/**
@@ -2497,6 +2626,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
+#ifdef CONFIG_RCU_USER_QS
+ WARN_ON_ONCE(rdp->dynticks->in_user);
+#endif
rdp->cpu = cpu;
rdp->rsp = rsp;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -2523,6 +2655,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
+ init_callback_list(rdp); /* Re-enable callbacks on this CPU. */
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
@@ -2555,7 +2688,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->completed = rnp->completed;
rdp->passed_quiesce = 0;
rdp->qs_pending = 0;
- rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
}
raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
@@ -2594,12 +2726,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
break;
case CPU_ONLINE:
case CPU_DOWN_FAILED:
- rcu_node_kthread_setaffinity(rnp, -1);
- rcu_cpu_kthread_setrt(cpu, 1);
+ rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
- rcu_node_kthread_setaffinity(rnp, cpu);
- rcu_cpu_kthread_setrt(cpu, 0);
+ rcu_boost_kthread_setaffinity(rnp, cpu);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
@@ -2627,6 +2757,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
}
/*
+ * Spawn the kthread that handles this RCU flavor's grace periods.
+ */
+static int __init rcu_spawn_gp_kthread(void)
+{
+ unsigned long flags;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+ struct task_struct *t;
+
+ for_each_rcu_flavor(rsp) {
+ t = kthread_run(rcu_gp_kthread, rsp, rsp->name);
+ BUG_ON(IS_ERR(t));
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rsp->gp_kthread = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+ return 0;
+}
+early_initcall(rcu_spawn_gp_kthread);
+
+/*
* This function is invoked towards the end of the scheduler's initialization
* process. Before this is called, the idle task might contain
* RCU read-side critical sections (during which time, this idle
@@ -2661,7 +2813,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
int cprv;
int i;
- cprv = NR_CPUS;
+ cprv = nr_cpu_ids;
for (i = rcu_num_lvls - 1; i >= 0; i--) {
ccur = rsp->levelcnt[i];
rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
@@ -2676,10 +2828,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static char *buf[] = { "rcu_node_level_0",
- "rcu_node_level_1",
- "rcu_node_level_2",
- "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
+ static char *buf[] = { "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static char *fqs[] = { "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
int cpustride = 1;
int i;
int j;
@@ -2704,7 +2860,11 @@ static void __init rcu_init_one(struct rcu_state *rsp,
raw_spin_lock_init(&rnp->lock);
lockdep_set_class_and_name(&rnp->lock,
&rcu_node_class[i], buf[i]);
- rnp->gpnum = 0;
+ raw_spin_lock_init(&rnp->fqslock);
+ lockdep_set_class_and_name(&rnp->fqslock,
+ &rcu_fqs_class[i], fqs[i]);
+ rnp->gpnum = rsp->gpnum;
+ rnp->completed = rsp->completed;
rnp->qsmask = 0;
rnp->qsmaskinit = 0;
rnp->grplo = j * cpustride;
@@ -2727,6 +2887,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rsp->rda = rda;
+ init_waitqueue_head(&rsp->gp_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
@@ -2750,7 +2911,8 @@ static void __init rcu_init_geometry(void)
int rcu_capacity[MAX_RCU_LVLS + 1];
/* If the compile-time values are accurate, just leave. */
- if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+ if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF &&
+ nr_cpu_ids == NR_CPUS)
return;
/*
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4d29169f212..5faf05d6832 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -102,6 +102,10 @@ struct rcu_dynticks {
/* idle-period nonlazy_posted snapshot. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
+#ifdef CONFIG_RCU_USER_QS
+ bool ignore_user_qs; /* Treat userspace as extended QS or not */
+ bool in_user; /* Is the CPU in userland from RCU POV? */
+#endif
};
/* RCU's kthread states for tracing. */
@@ -196,12 +200,7 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
- struct task_struct *node_kthread_task;
- /* kthread that takes care of this rcu_node */
- /* structure, for example, awakening the */
- /* per-CPU kthreads as needed. */
- unsigned int node_kthread_status;
- /* State of node_kthread_task for tracing. */
+ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
/*
@@ -245,8 +244,6 @@ struct rcu_data {
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
- unsigned long passed_quiesce_gpnum;
- /* gpnum at time of quiescent state. */
bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
@@ -312,11 +309,13 @@ struct rcu_data {
unsigned long n_rp_cpu_needs_gp;
unsigned long n_rp_gp_completed;
unsigned long n_rp_gp_started;
- unsigned long n_rp_need_fqs;
unsigned long n_rp_need_nothing;
- /* 6) _rcu_barrier() callback. */
+ /* 6) _rcu_barrier() and OOM callbacks. */
struct rcu_head barrier_head;
+#ifdef CONFIG_RCU_FAST_NO_HZ
+ struct rcu_head oom_head;
+#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
int cpu;
struct rcu_state *rsp;
@@ -375,20 +374,17 @@ struct rcu_state {
u8 fqs_state ____cacheline_internodealigned_in_smp;
/* Force QS state. */
- u8 fqs_active; /* force_quiescent_state() */
- /* is running. */
- u8 fqs_need_gp; /* A CPU was prevented from */
- /* starting a new grace */
- /* period because */
- /* force_quiescent_state() */
- /* was running. */
u8 boost; /* Subject to priority boost. */
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
+ struct task_struct *gp_kthread; /* Task for grace periods. */
+ wait_queue_head_t gp_wq; /* Where GP task waits. */
+ int gp_flags; /* Commands for GP task. */
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t onofflock; /* exclude on/offline and */
+ raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp;
+ /* exclude on/offline and */
/* starting new GP. */
struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */
/* need a grace period. */
@@ -398,16 +394,11 @@ struct rcu_state {
struct rcu_head **orphan_donetail; /* Tail of above. */
long qlen_lazy; /* Number of lazy callbacks. */
long qlen; /* Total number of callbacks. */
- struct task_struct *rcu_barrier_in_progress;
- /* Task doing rcu_barrier(), */
- /* or NULL if no barrier. */
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
unsigned long n_barrier_done; /* ++ at start and end of */
/* _rcu_barrier(). */
- raw_spinlock_t fqslock; /* Only one task forcing */
- /* quiescent states. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
/* force_quiescent_state(). */
unsigned long n_force_qs; /* Number of calls to */
@@ -426,6 +417,10 @@ struct rcu_state {
struct list_head flavors; /* List of RCU flavors. */
};
+/* Values for rcu_state structure's gp_flags field. */
+#define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */
+#define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */
+
extern struct list_head rcu_struct_flavors;
#define for_each_rcu_flavor(rsp) \
list_for_each_entry((rsp), &rcu_struct_flavors, flavors)
@@ -468,7 +463,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
-static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static int rcu_print_task_stall(struct rcu_node *rnp);
@@ -491,15 +485,9 @@ static void invoke_rcu_callbacks_kthread(void);
static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void);
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm);
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index);
-static void invoke_rcu_node_kthread(struct rcu_node *rnp);
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
+ struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 7f3244c0df0..f9211548818 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -25,6 +25,8 @@
*/
#include <linux/delay.h>
+#include <linux/oom.h>
+#include <linux/smpboot.h>
#define RCU_KTHREAD_PRIO 1
@@ -118,7 +120,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
*/
void rcu_force_quiescent_state(void)
{
- force_quiescent_state(&rcu_preempt_state, 0);
+ force_quiescent_state(&rcu_preempt_state);
}
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
@@ -136,8 +138,6 @@ static void rcu_preempt_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
- rdp->passed_quiesce_gpnum = rdp->gpnum;
- barrier();
if (rdp->passed_quiesce == 0)
trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
rdp->passed_quiesce = 1;
@@ -422,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
unsigned long flags;
struct task_struct *t;
- if (!rcu_preempt_blocked_readers_cgp(rnp))
- return;
raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
t = list_entry(rnp->gp_tasks,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
@@ -584,17 +586,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
}
+ rnp->gp_tasks = NULL;
+ rnp->exp_tasks = NULL;
#ifdef CONFIG_RCU_BOOST
- /* In case root is being boosted and leaf is not. */
+ rnp->boost_tasks = NULL;
+ /*
+ * In case root is being boosted and leaf was not. Make sure
+ * that we boost the tasks blocking the current grace period
+ * in this case.
+ */
raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
if (rnp_root->boost_tasks != NULL &&
- rnp_root->boost_tasks != rnp_root->gp_tasks)
+ rnp_root->boost_tasks != rnp_root->gp_tasks &&
+ rnp_root->boost_tasks != rnp_root->exp_tasks)
rnp_root->boost_tasks = rnp_root->gp_tasks;
raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
#endif /* #ifdef CONFIG_RCU_BOOST */
- rnp->gp_tasks = NULL;
- rnp->exp_tasks = NULL;
return retval;
}
@@ -676,7 +684,7 @@ void synchronize_rcu(void)
EXPORT_SYMBOL_GPL(synchronize_rcu);
static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
-static long sync_rcu_preempt_exp_count;
+static unsigned long sync_rcu_preempt_exp_count;
static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
/*
@@ -791,7 +799,7 @@ void synchronize_rcu_expedited(void)
unsigned long flags;
struct rcu_node *rnp;
struct rcu_state *rsp = &rcu_preempt_state;
- long snap;
+ unsigned long snap;
int trycount = 0;
smp_mb(); /* Caller's modifications seen first by other CPUs. */
@@ -799,33 +807,47 @@ void synchronize_rcu_expedited(void)
smp_mb(); /* Above access cannot bleed into critical section. */
/*
+ * Block CPU-hotplug operations. This means that any CPU-hotplug
+ * operation that finds an rcu_node structure with tasks in the
+ * process of being boosted will know that all tasks blocking
+ * this expedited grace period will already be in the process of
+ * being boosted. This simplifies the process of moving tasks
+ * from leaf to root rcu_node structures.
+ */
+ get_online_cpus();
+
+ /*
* Acquire lock, falling back to synchronize_rcu() if too many
* lock-acquisition failures. Of course, if someone does the
* expedited grace period for us, just leave.
*/
while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
+ if (ULONG_CMP_LT(snap,
+ ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ put_online_cpus();
+ goto mb_ret; /* Others did our work for us. */
+ }
if (trycount++ < 10) {
udelay(trycount * num_online_cpus());
} else {
+ put_online_cpus();
synchronize_rcu();
return;
}
- if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
- goto mb_ret; /* Others did our work for us. */
}
- if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
+ if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
+ put_online_cpus();
goto unlock_mb_ret; /* Others did our work for us. */
+ }
/* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
- raw_spin_lock_irqsave(&rsp->onofflock, flags);
-
/* Initialize ->expmask for all non-leaf rcu_node structures. */
rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
rnp->expmask = rnp->qsmaskinit;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/* Snapshot current state of ->blkd_tasks lists. */
@@ -834,7 +856,7 @@ void synchronize_rcu_expedited(void)
if (NUM_RCU_NODES > 1)
sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
- raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
+ put_online_cpus();
/* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
@@ -1069,6 +1091,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+static void rcu_wake_cond(struct task_struct *t, int status)
+{
+ /*
+ * If the thread is yielding, only wake it when this
+ * is invoked from idle
+ */
+ if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
+ wake_up_process(t);
+}
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1141,17 +1173,6 @@ static int rcu_boost(struct rcu_node *rnp)
}
/*
- * Timer handler to initiate waking up of boost kthreads that
- * have yielded the CPU due to excessive numbers of tasks to
- * boost. We wake up the per-rcu_node kthread, which in turn
- * will wake up the booster kthread.
- */
-static void rcu_boost_kthread_timer(unsigned long arg)
-{
- invoke_rcu_node_kthread((struct rcu_node *)arg);
-}
-
-/*
* Priority-boosting kthread. One per leaf rcu_node and one for the
* root rcu_node.
*/
@@ -1174,8 +1195,9 @@ static int rcu_boost_kthread(void *arg)
else
spincnt = 0;
if (spincnt > 10) {
+ rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
trace_rcu_utilization("End boost kthread@rcu_yield");
- rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+ schedule_timeout_interruptible(2);
trace_rcu_utilization("Start boost kthread@rcu_yield");
spincnt = 0;
}
@@ -1191,9 +1213,9 @@ static int rcu_boost_kthread(void *arg)
* kthread to start boosting them. If there is an expedited grace
* period in progress, it is always time to boost.
*
- * The caller must hold rnp->lock, which this function releases,
- * but irqs remain disabled. The ->boost_kthread_task is immortal,
- * so we don't need to worry about it going away.
+ * The caller must hold rnp->lock, which this function releases.
+ * The ->boost_kthread_task is immortal, so we don't need to worry
+ * about it going away.
*/
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
{
@@ -1213,8 +1235,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
t = rnp->boost_kthread_task;
- if (t != NULL)
- wake_up_process(t);
+ if (t)
+ rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1231,8 +1253,10 @@ static void invoke_rcu_callbacks_kthread(void)
local_irq_save(flags);
__this_cpu_write(rcu_cpu_has_work, 1);
if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
- current != __this_cpu_read(rcu_cpu_kthread_task))
- wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+ current != __this_cpu_read(rcu_cpu_kthread_task)) {
+ rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
+ __this_cpu_read(rcu_cpu_kthread_status));
+ }
local_irq_restore(flags);
}
@@ -1245,21 +1269,6 @@ static bool rcu_is_callbacks_kthread(void)
return __get_cpu_var(rcu_cpu_kthread_task) == current;
}
-/*
- * Set the affinity of the boost kthread. The CPU-hotplug locks are
- * held, so no one should be messing with the existence of the boost
- * kthread.
- */
-static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
- cpumask_var_t cm)
-{
- struct task_struct *t;
-
- t = rnp->boost_kthread_task;
- if (t != NULL)
- set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
-}
-
#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
@@ -1276,15 +1285,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
* Returns zero if all is well, a negated errno otherwise.
*/
static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp,
- int rnp_index)
+ struct rcu_node *rnp)
{
+ int rnp_index = rnp - &rsp->node[0];
unsigned long flags;
struct sched_param sp;
struct task_struct *t;
if (&rcu_preempt_state != rsp)
return 0;
+
+ if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+ return 0;
+
rsp->boost = 1;
if (rnp->boost_kthread_task != NULL)
return 0;
@@ -1301,25 +1314,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return 0;
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Stop the RCU's per-CPU kthread when its CPU goes offline,.
- */
-static void rcu_stop_cpu_kthread(int cpu)
-{
- struct task_struct *t;
-
- /* Stop the CPU's kthread. */
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t != NULL) {
- per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
- kthread_stop(t);
- }
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
static void rcu_kthread_do_work(void)
{
rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data));
@@ -1327,112 +1321,22 @@ static void rcu_kthread_do_work(void)
rcu_preempt_do_callbacks();
}
-/*
- * Wake up the specified per-rcu_node-structure kthread.
- * Because the per-rcu_node kthreads are immortal, we don't need
- * to do anything to keep them alive.
- */
-static void invoke_rcu_node_kthread(struct rcu_node *rnp)
-{
- struct task_struct *t;
-
- t = rnp->node_kthread_task;
- if (t != NULL)
- wake_up_process(t);
-}
-
-/*
- * Set the specified CPU's kthread to run RT or not, as specified by
- * the to_rt argument. The CPU-hotplug locks are held, so the task
- * is not going away.
- */
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+static void rcu_cpu_kthread_setup(unsigned int cpu)
{
- int policy;
struct sched_param sp;
- struct task_struct *t;
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (t == NULL)
- return;
- if (to_rt) {
- policy = SCHED_FIFO;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- } else {
- policy = SCHED_NORMAL;
- sp.sched_priority = 0;
- }
- sched_setscheduler_nocheck(t, policy, &sp);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
-/*
- * Timer handler to initiate the waking up of per-CPU kthreads that
- * have yielded the CPU due to excess numbers of RCU callbacks.
- * We wake up the per-rcu_node kthread, which in turn will wake up
- * the booster kthread.
- */
-static void rcu_cpu_kthread_timer(unsigned long arg)
+static void rcu_cpu_kthread_park(unsigned int cpu)
{
- struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
- struct rcu_node *rnp = rdp->mynode;
-
- atomic_or(rdp->grpmask, &rnp->wakemask);
- invoke_rcu_node_kthread(rnp);
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}
-/*
- * Drop to non-real-time priority and yield, but only after posting a
- * timer that will cause us to regain our real-time priority if we
- * remain preempted. Either way, we restore our real-time priority
- * before returning.
- */
-static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
- struct sched_param sp;
- struct timer_list yield_timer;
- int prio = current->rt_priority;
-
- setup_timer_on_stack(&yield_timer, f, arg);
- mod_timer(&yield_timer, jiffies + 2);
- sp.sched_priority = 0;
- sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
- set_user_nice(current, 19);
- schedule();
- set_user_nice(current, 0);
- sp.sched_priority = prio;
- sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
- del_timer(&yield_timer);
-}
-
-/*
- * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
- * This can happen while the corresponding CPU is either coming online
- * or going offline. We cannot wait until the CPU is fully online
- * before starting the kthread, because the various notifier functions
- * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
- * the corresponding CPU is online.
- *
- * Return 1 if the kthread needs to stop, 0 otherwise.
- *
- * Caller must disable bh. This function can momentarily enable it.
- */
-static int rcu_cpu_kthread_should_stop(int cpu)
-{
- while (cpu_is_offline(cpu) ||
- !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
- smp_processor_id() != cpu) {
- if (kthread_should_stop())
- return 1;
- per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
- per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
- local_bh_enable();
- schedule_timeout_uninterruptible(1);
- if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
- set_cpus_allowed_ptr(current, cpumask_of(cpu));
- local_bh_disable();
- }
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- return 0;
+ return __get_cpu_var(rcu_cpu_has_work);
}
/*
@@ -1440,138 +1344,35 @@ static int rcu_cpu_kthread_should_stop(int cpu)
* RCU softirq used in flavors and configurations of RCU that do not
* support RCU priority boosting.
*/
-static int rcu_cpu_kthread(void *arg)
+static void rcu_cpu_kthread(unsigned int cpu)
{
- int cpu = (int)(long)arg;
- unsigned long flags;
- int spincnt = 0;
- unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
- char work;
- char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+ unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status);
+ char work, *workp = &__get_cpu_var(rcu_cpu_has_work);
+ int spincnt;
- trace_rcu_utilization("Start CPU kthread@init");
- for (;;) {
- *statusp = RCU_KTHREAD_WAITING;
- trace_rcu_utilization("End CPU kthread@rcu_wait");
- rcu_wait(*workp != 0 || kthread_should_stop());
+ for (spincnt = 0; spincnt < 10; spincnt++) {
trace_rcu_utilization("Start CPU kthread@rcu_wait");
local_bh_disable();
- if (rcu_cpu_kthread_should_stop(cpu)) {
- local_bh_enable();
- break;
- }
*statusp = RCU_KTHREAD_RUNNING;
- per_cpu(rcu_cpu_kthread_loops, cpu)++;
- local_irq_save(flags);
+ this_cpu_inc(rcu_cpu_kthread_loops);
+ local_irq_disable();
work = *workp;
*workp = 0;
- local_irq_restore(flags);
+ local_irq_enable();
if (work)
rcu_kthread_do_work();
local_bh_enable();
- if (*workp != 0)
- spincnt++;
- else
- spincnt = 0;
- if (spincnt > 10) {
- *statusp = RCU_KTHREAD_YIELDING;
- trace_rcu_utilization("End CPU kthread@rcu_yield");
- rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
- trace_rcu_utilization("Start CPU kthread@rcu_yield");
- spincnt = 0;
- }
- }
- *statusp = RCU_KTHREAD_STOPPED;
- trace_rcu_utilization("End CPU kthread@term");
- return 0;
-}
-
-/*
- * Spawn a per-CPU kthread, setting up affinity and priority.
- * Because the CPU hotplug lock is held, no other CPU will be attempting
- * to manipulate rcu_cpu_kthread_task. There might be another CPU
- * attempting to access it during boot, but the locking in kthread_bind()
- * will enforce sufficient ordering.
- *
- * Please note that we cannot simply refuse to wake up the per-CPU
- * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state,
- * which can result in softlockup complaints if the task ends up being
- * idle for more than a couple of minutes.
- *
- * However, please note also that we cannot bind the per-CPU kthread to its
- * CPU until that CPU is fully online. We also cannot wait until the
- * CPU is fully online before we create its per-CPU kthread, as this would
- * deadlock the system when CPU notifiers tried waiting for grace
- * periods. So we bind the per-CPU kthread to its CPU only if the CPU
- * is online. If its CPU is not yet fully online, then the code in
- * rcu_cpu_kthread() will wait until it is fully online, and then do
- * the binding.
- */
-static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
-{
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
- return 0;
- t = kthread_create_on_node(rcu_cpu_kthread,
- (void *)(long)cpu,
- cpu_to_node(cpu),
- "rcuc/%d", cpu);
- if (IS_ERR(t))
- return PTR_ERR(t);
- if (cpu_online(cpu))
- kthread_bind(t, cpu);
- per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
- WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- per_cpu(rcu_cpu_kthread_task, cpu) = t;
- wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */
- return 0;
-}
-
-/*
- * Per-rcu_node kthread, which is in charge of waking up the per-CPU
- * kthreads when needed. We ignore requests to wake up kthreads
- * for offline CPUs, which is OK because force_quiescent_state()
- * takes care of this case.
- */
-static int rcu_node_kthread(void *arg)
-{
- int cpu;
- unsigned long flags;
- unsigned long mask;
- struct rcu_node *rnp = (struct rcu_node *)arg;
- struct sched_param sp;
- struct task_struct *t;
-
- for (;;) {
- rnp->node_kthread_status = RCU_KTHREAD_WAITING;
- rcu_wait(atomic_read(&rnp->wakemask) != 0);
- rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
- raw_spin_lock_irqsave(&rnp->lock, flags);
- mask = atomic_xchg(&rnp->wakemask, 0);
- rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
- for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
- if ((mask & 0x1) == 0)
- continue;
- preempt_disable();
- t = per_cpu(rcu_cpu_kthread_task, cpu);
- if (!cpu_online(cpu) || t == NULL) {
- preempt_enable();
- continue;
- }
- per_cpu(rcu_cpu_has_work, cpu) = 1;
- sp.sched_priority = RCU_KTHREAD_PRIO;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- preempt_enable();
+ if (*workp == 0) {
+ trace_rcu_utilization("End CPU kthread@rcu_wait");
+ *statusp = RCU_KTHREAD_WAITING;
+ return;
}
}
- /* NOTREACHED */
- rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
- return 0;
+ *statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization("Start CPU kthread@rcu_yield");
+ schedule_timeout_interruptible(2);
+ trace_rcu_utilization("End CPU kthread@rcu_yield");
+ *statusp = RCU_KTHREAD_WAITING;
}
/*
@@ -1583,17 +1384,17 @@ static int rcu_node_kthread(void *arg)
* no outgoing CPU. If there are no CPUs left in the affinity set,
* this function allows the kthread to execute on any CPU.
*/
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
+ struct task_struct *t = rnp->boost_kthread_task;
+ unsigned long mask = rnp->qsmaskinit;
cpumask_var_t cm;
int cpu;
- unsigned long mask = rnp->qsmaskinit;
- if (rnp->node_kthread_task == NULL)
+ if (!t)
return;
- if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+ if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
return;
- cpumask_clear(cm);
for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
if ((mask & 0x1) && cpu != outgoingcpu)
cpumask_set_cpu(cpu, cm);
@@ -1603,62 +1404,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
cpumask_clear_cpu(cpu, cm);
WARN_ON_ONCE(cpumask_weight(cm) == 0);
}
- set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
- rcu_boost_kthread_setaffinity(rnp, cm);
+ set_cpus_allowed_ptr(t, cm);
free_cpumask_var(cm);
}
-/*
- * Spawn a per-rcu_node kthread, setting priority and affinity.
- * Called during boot before online/offline can happen, or, if
- * during runtime, with the main CPU-hotplug locks held. So only
- * one of these can be executing at a time.
- */
-static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
- struct rcu_node *rnp)
-{
- unsigned long flags;
- int rnp_index = rnp - &rsp->node[0];
- struct sched_param sp;
- struct task_struct *t;
-
- if (!rcu_scheduler_fully_active ||
- rnp->qsmaskinit == 0)
- return 0;
- if (rnp->node_kthread_task == NULL) {
- t = kthread_create(rcu_node_kthread, (void *)rnp,
- "rcun/%d", rnp_index);
- if (IS_ERR(t))
- return PTR_ERR(t);
- raw_spin_lock_irqsave(&rnp->lock, flags);
- rnp->node_kthread_task = t;
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = 99;
- sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
- wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
- }
- return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
-}
+static struct smp_hotplug_thread rcu_cpu_thread_spec = {
+ .store = &rcu_cpu_kthread_task,
+ .thread_should_run = rcu_cpu_kthread_should_run,
+ .thread_fn = rcu_cpu_kthread,
+ .thread_comm = "rcuc/%u",
+ .setup = rcu_cpu_kthread_setup,
+ .park = rcu_cpu_kthread_park,
+};
/*
* Spawn all kthreads -- called as soon as the scheduler is running.
*/
static int __init rcu_spawn_kthreads(void)
{
- int cpu;
struct rcu_node *rnp;
+ int cpu;
rcu_scheduler_fully_active = 1;
- for_each_possible_cpu(cpu) {
+ for_each_possible_cpu(cpu)
per_cpu(rcu_cpu_has_work, cpu) = 0;
- if (cpu_online(cpu))
- (void)rcu_spawn_one_cpu_kthread(cpu);
- }
+ BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
rnp = rcu_get_root(rcu_state);
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
if (NUM_RCU_NODES > 1) {
rcu_for_each_leaf_node(rcu_state, rnp)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}
return 0;
}
@@ -1670,11 +1445,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
struct rcu_node *rnp = rdp->mynode;
/* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
- if (rcu_scheduler_fully_active) {
- (void)rcu_spawn_one_cpu_kthread(cpu);
- if (rnp->node_kthread_task == NULL)
- (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
- }
+ if (rcu_scheduler_fully_active)
+ (void)rcu_spawn_one_boost_kthread(rcu_state, rnp);
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -1698,19 +1470,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void rcu_stop_cpu_kthread(int cpu)
-{
-}
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
-{
-}
-
-static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}
@@ -1997,6 +1757,26 @@ static void rcu_prepare_for_idle(int cpu)
if (!tne)
return;
+ /* Adaptive-tick mode, where usermode execution is idle to RCU. */
+ if (!is_idle_task(current)) {
+ rdtp->dyntick_holdoff = jiffies - 1;
+ if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
+ trace_rcu_prep_idle("User dyntick with callbacks");
+ rdtp->idle_gp_timer_expires =
+ round_up(jiffies + RCU_IDLE_GP_DELAY,
+ RCU_IDLE_GP_DELAY);
+ } else if (rcu_cpu_has_callbacks(cpu)) {
+ rdtp->idle_gp_timer_expires =
+ round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
+ trace_rcu_prep_idle("User dyntick with lazy callbacks");
+ } else {
+ return;
+ }
+ tp = &rdtp->idle_gp_timer;
+ mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
+ return;
+ }
+
/*
* If this is an idle re-entry, for example, due to use of
* RCU_NONIDLE() or the new idle-loop tracing API within the idle
@@ -2075,16 +1855,16 @@ static void rcu_prepare_for_idle(int cpu)
#ifdef CONFIG_TREE_PREEMPT_RCU
if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
rcu_preempt_qs(cpu);
- force_quiescent_state(&rcu_preempt_state, 0);
+ force_quiescent_state(&rcu_preempt_state);
}
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
if (per_cpu(rcu_sched_data, cpu).nxtlist) {
rcu_sched_qs(cpu);
- force_quiescent_state(&rcu_sched_state, 0);
+ force_quiescent_state(&rcu_sched_state);
}
if (per_cpu(rcu_bh_data, cpu).nxtlist) {
rcu_bh_qs(cpu);
- force_quiescent_state(&rcu_bh_state, 0);
+ force_quiescent_state(&rcu_bh_state);
}
/*
@@ -2112,6 +1892,88 @@ static void rcu_idle_count_callbacks_posted(void)
__this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
}
+/*
+ * Data for flushing lazy RCU callbacks at OOM time.
+ */
+static atomic_t oom_callback_count;
+static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
+
+/*
+ * RCU OOM callback -- decrement the outstanding count and deliver the
+ * wake-up if we are the last one.
+ */
+static void rcu_oom_callback(struct rcu_head *rhp)
+{
+ if (atomic_dec_and_test(&oom_callback_count))
+ wake_up(&oom_callback_wq);
+}
+
+/*
+ * Post an rcu_oom_notify callback on the current CPU if it has at
+ * least one lazy callback. This will unnecessarily post callbacks
+ * to CPUs that already have a non-lazy callback at the end of their
+ * callback list, but this is an infrequent operation, so accept some
+ * extra overhead to keep things simple.
+ */
+static void rcu_oom_notify_cpu(void *unused)
+{
+ struct rcu_state *rsp;
+ struct rcu_data *rdp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = __this_cpu_ptr(rsp->rda);
+ if (rdp->qlen_lazy != 0) {
+ atomic_inc(&oom_callback_count);
+ rsp->call(&rdp->oom_head, rcu_oom_callback);
+ }
+ }
+}
+
+/*
+ * If low on memory, ensure that each CPU has a non-lazy callback.
+ * This will wake up CPUs that have only lazy callbacks, in turn
+ * ensuring that they free up the corresponding memory in a timely manner.
+ * Because an uncertain amount of memory will be freed in some uncertain
+ * timeframe, we do not claim to have freed anything.
+ */
+static int rcu_oom_notify(struct notifier_block *self,
+ unsigned long notused, void *nfreed)
+{
+ int cpu;
+
+ /* Wait for callbacks from earlier instance to complete. */
+ wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
+
+ /*
+ * Prevent premature wakeup: ensure that all increments happen
+ * before there is a chance of the counter reaching zero.
+ */
+ atomic_set(&oom_callback_count, 1);
+
+ get_online_cpus();
+ for_each_online_cpu(cpu) {
+ smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
+ cond_resched();
+ }
+ put_online_cpus();
+
+ /* Unconditionally decrement: no need to wake ourselves up. */
+ atomic_dec(&oom_callback_count);
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block rcu_oom_nb = {
+ .notifier_call = rcu_oom_notify
+};
+
+static int __init rcu_register_oom_notifier(void)
+{
+ register_oom_notifier(&rcu_oom_nb);
+ return 0;
+}
+early_initcall(rcu_register_oom_notifier);
+
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#ifdef CONFIG_RCU_CPU_STALL_INFO
@@ -2122,11 +1984,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct timer_list *tltp = &rdtp->idle_gp_timer;
+ char c;
- sprintf(cp, "drain=%d %c timer=%lu",
- rdtp->dyntick_drain,
- rdtp->dyntick_holdoff == jiffies ? 'H' : '.',
- timer_pending(tltp) ? tltp->expires - jiffies : -1);
+ c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
+ if (timer_pending(tltp))
+ sprintf(cp, "drain=%d %c timer=%lu",
+ rdtp->dyntick_drain, c, tltp->expires - jiffies);
+ else
+ sprintf(cp, "drain=%d %c timer not pending",
+ rdtp->dyntick_drain, c);
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2194,11 +2060,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp)
/* Increment ->ticks_this_gp for all flavors of RCU. */
static void increment_cpu_stall_ticks(void)
{
- __get_cpu_var(rcu_sched_data).ticks_this_gp++;
- __get_cpu_var(rcu_bh_data).ticks_this_gp++;
-#ifdef CONFIG_TREE_PREEMPT_RCU
- __get_cpu_var(rcu_preempt_data).ticks_this_gp++;
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ __this_cpu_ptr(rsp->rda)->ticks_this_gp++;
}
#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index abffb486e94..693513bc50e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- seq_printf(m, "%s: %c bcc: %d nbd: %lu\n",
- rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.',
+ seq_printf(m, "%s: bcc: %d nbd: %lu\n",
+ rsp->name,
atomic_read(&rsp->barrier_cpu_count),
rsp->n_barrier_done);
return 0;
@@ -86,12 +86,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
+ seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->completed, rdp->gpnum,
- rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
- rdp->qs_pending);
+ rdp->passed_quiesce, rdp->qs_pending);
seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
@@ -108,11 +107,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->nxttail[RCU_WAIT_TAIL]],
".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " kt=%d/%c/%d ktl=%x",
+ seq_printf(m, " kt=%d/%c ktl=%x",
per_cpu(rcu_cpu_has_work, rdp->cpu),
convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
rdp->cpu)),
- per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff);
#endif /* #ifdef CONFIG_RCU_BOOST */
seq_printf(m, " b=%ld", rdp->blimit);
@@ -150,12 +148,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
return;
- seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
+ seq_printf(m, "%d,%s,%lu,%lu,%d,%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
rdp->completed, rdp->gpnum,
- rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
- rdp->qs_pending);
+ rdp->passed_quiesce, rdp->qs_pending);
seq_printf(m, ",%d,%llx,%d,%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
@@ -186,7 +183,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
int cpu;
struct rcu_state *rsp;
- seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
+ seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\",");
seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\"");
#ifdef CONFIG_RCU_BOOST
@@ -386,10 +383,9 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
rdp->n_rp_report_qs,
rdp->n_rp_cb_ready,
rdp->n_rp_cpu_needs_gp);
- seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+ seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n",
rdp->n_rp_gp_completed,
rdp->n_rp_gp_started,
- rdp->n_rp_need_fqs,
rdp->n_rp_need_nothing);
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 173ea52f3af..f06d249e103 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d325c4b2dcb..c1774723643 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
dequeue_task(rq, p, flags);
}
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- u64 irq_time;
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void account_system_vtime(struct task_struct *curr)
-{
- unsigned long flags;
- s64 delta;
- int cpu;
-
- if (!sched_clock_irqtime)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
-
- irq_time_write_begin();
- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(account_system_vtime);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#ifdef CONFIG_PARAVIRT
-static inline u64 steal_ticks(u64 steal)
-{
- if (unlikely(steal > NSEC_PER_SEC))
- return div_u64(steal, TICK_NSEC);
-
- return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
-}
-#endif
-
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
@@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#endif
}
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static int irqtime_account_hi_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
-
-static int irqtime_account_si_update(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- unsigned long flags;
- u64 latest_ns;
- int ret = 0;
-
- local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
- local_irq_restore(flags);
- return ret;
-}
-
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-#endif
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
smp_send_reschedule(cpu);
}
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
-{
- struct rq *rq;
- int ret = 0;
-
- rq = __task_rq_lock(p);
- if (p->on_cpu) {
- ttwu_activate(rq, p, ENQUEUE_WAKEUP);
- ttwu_do_wakeup(rq, p, wake_flags);
- ret = 1;
- }
- __task_rq_unlock(rq);
-
- return ret;
-
-}
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
-
bool cpus_share_cache(int this_cpu, int that_cpu)
{
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* If the owning (remote) cpu is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*/
- while (p->on_cpu) {
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- /*
- * In case the architecture enables interrupts in
- * context_switch(), we cannot busy wait, since that
- * would lead to deadlocks when an interrupt hits and
- * tries to wake up @prev. So bail and do a complete
- * remote wakeup.
- */
- if (ttwu_activate_remote(p, wake_flags))
- goto stat;
-#else
+ while (p->on_cpu)
cpu_relax();
-#endif
- }
/*
* Pairs with the smp_wmb() in finish_lock_switch().
*/
@@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_state = prev->state;
+ vtime_task_switch(prev);
finish_arch_switch(prev);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_disable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
perf_event_task_sched_in(prev, current);
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_enable();
-#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -2081,6 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
#endif
/* Here we just switch the register state and the stack. */
+ rcu_switch(prev, next);
switch_to(prev, next, prev);
barrier();
@@ -2809,398 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-#ifdef CONFIG_CGROUP_CPUACCT
-struct cgroup_subsys cpuacct_subsys;
-struct cpuacct root_cpuacct;
-#endif
-
-static inline void task_group_account_field(struct task_struct *p, int index,
- u64 tmp)
-{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
- /*
- * Since all updates are sure to touch the root cgroup, we
- * get ourselves ahead and touch it first. If the root cgroup
- * is the only cgroup, then nothing else should be necessary.
- *
- */
- __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
-#endif
-}
-
-
-/*
- * Account user cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in user space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- int index;
-
- /* Add user time to process. */
- p->utime += cputime;
- p->utimescaled += cputime_scaled;
- account_group_user_time(p, cputime);
-
- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
-
- /* Add user time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
-
- /* Account for user time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- /* Add guest time to process. */
- p->utime += cputime;
- p->utimescaled += cputime_scaled;
- account_group_user_time(p, cputime);
- p->gtime += cputime;
-
- /* Add guest time to cpustat. */
- if (TASK_NICE(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
- } else {
- cpustat[CPUTIME_USER] += (__force u64) cputime;
- cpustat[CPUTIME_GUEST] += (__force u64) cputime;
- }
-}
-
-/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
- */
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, int index)
-{
- /* Add system time to process. */
- p->stime += cputime;
- p->stimescaled += cputime_scaled;
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- task_group_account_field(p, index, (__force u64) cputime);
-
- /* Account for system time used */
- acct_update_integrals(p);
-}
-
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
-{
- int index;
-
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
- account_guest_time(p, cputime, cputime_scaled);
- return;
- }
-
- if (hardirq_count() - hardirq_offset)
- index = CPUTIME_IRQ;
- else if (in_serving_softirq())
- index = CPUTIME_SOFTIRQ;
- else
- index = CPUTIME_SYSTEM;
-
- __account_system_time(p, cputime, cputime_scaled, index);
-}
-
-/*
- * Account for involuntary wait time.
- * @cputime: the cpu time spent in involuntary wait
- */
-void account_steal_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- cpustat[CPUTIME_STEAL] += (__force u64) cputime;
-}
-
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-void account_idle_time(cputime_t cputime)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
-
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
- else
- cpustat[CPUTIME_IDLE] += (__force u64) cputime;
-}
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
- if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal, st = 0;
-
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
-
- st = steal_ticks(steal);
- this_rq()->prev_steal_time += st * TICK_NSEC;
-
- account_steal_time(st);
- return st;
- }
-#endif
- return false;
-}
-
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-/*
- * Account a tick to a process and cpustat
- * @p: the process that the cpu time gets accounted to
- * @user_tick: is the tick from userspace
- * @rq: the pointer to rq
- *
- * Tick demultiplexing follows the order
- * - pending hardirq update
- * - pending softirq update
- * - user_time
- * - idle_time
- * - system time
- * - check for guest_time
- * - else account as system_time
- *
- * Check for hardirq is done both for system and user time as there is
- * no timer going off while we are on hardirq and hence we may never get an
- * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
- * softirq as those do not count in task exec_runtime any more.
- */
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- if (steal_account_process_tick())
- return;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
- } else if (this_cpu_ksoftirqd() == p) {
- /*
- * ksoftirqd time do not get accounted in cpu_softirq_time.
- * So, we have to handle it separately here.
- * Also, p->stime needs to be updated for ksoftirqd.
- */
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
- } else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- } else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
- } else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
- } else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
- }
-}
-
-static void irqtime_account_idle_ticks(int ticks)
-{
- int i;
- struct rq *rq = this_rq();
-
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-/*
- * Account a single tick of cpu time.
- * @p: the process that the cpu time gets accounted to
- * @user_tick: indicates if the tick is a user or a system tick
- */
-void account_process_tick(struct task_struct *p, int user_tick)
-{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- struct rq *rq = this_rq();
-
- if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
- return;
- }
-
- if (steal_account_process_tick())
- return;
-
- if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
- else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
- else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
-}
-
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
-{
-
- if (sched_clock_irqtime) {
- irqtime_account_idle_ticks(ticks);
- return;
- }
-
- account_idle_time(jiffies_to_cputime(ticks));
-}
-
-#endif
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
-#endif
-
-void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, utime = p->utime, total = utime + p->stime;
-
- /*
- * Use CFS's precise accounting:
- */
- rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
-
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
- utime = rtime;
-
- /*
- * Compare with previous values, to keep monotonicity:
- */
- p->prev_utime = max(p->prev_utime, utime);
- p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
-
- *ut = p->prev_utime;
- *st = p->prev_stime;
-}
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct signal_struct *sig = p->signal;
- struct task_cputime cputime;
- cputime_t rtime, utime, total;
-
- thread_group_cputime(p, &cputime);
-
- total = cputime.utime + cputime.stime;
- rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
-
- if (total) {
- u64 temp = (__force u64) rtime;
-
- temp *= (__force u64) cputime.utime;
- do_div(temp, (__force u32) total);
- utime = (__force cputime_t) temp;
- } else
- utime = rtime;
-
- sig->prev_utime = max(sig->prev_utime, utime);
- sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
-
- *ut = sig->prev_utime;
- *st = sig->prev_stime;
-}
-#endif
-
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3361,6 +2776,40 @@ pick_next_task(struct rq *rq)
/*
* __schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ * paths. For example, see arch/x86/entry_64.S.
+ *
+ * To drive preemption between tasks, the scheduler sets the flag in timer
+ * interrupt handler scheduler_tick().
+ *
+ * 3. Wakeups don't really cause entry into schedule(). They add a
+ * task to the run-queue and that's it.
+ *
+ * Now, if the new task added to the run-queue preempts the current
+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ * called on the nearest possible occasion:
+ *
+ * - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ * - in syscall or exception context, at the next outmost
+ * preempt_enable(). (this might be as soon as the wake_up()'s
+ * spin_unlock()!)
+ *
+ * - in IRQ context, return from interrupt-handler to
+ * preemptible context
+ *
+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ * then at the next:
+ *
+ * - cond_resched() call
+ * - explicit schedule() call
+ * - return from syscall or exception to user-space
+ * - return from interrupt-handler to user-space
*/
static void __sched __schedule(void)
{
@@ -3462,6 +2911,21 @@ asmlinkage void __sched schedule(void)
}
EXPORT_SYMBOL(schedule);
+#ifdef CONFIG_RCU_USER_QS
+asmlinkage void __sched schedule_user(void)
+{
+ /*
+ * If we come here after a random call to set_need_resched(),
+ * or we have been woken up remotely but the IPI has not yet arrived,
+ * we haven't yet exited the RCU idle mode. Do it here manually until
+ * we find a better solution.
+ */
+ rcu_user_exit();
+ schedule();
+ rcu_user_enter();
+}
+#endif
+
/**
* schedule_preempt_disabled - called with preemption disabled
*
@@ -3563,6 +3027,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
+ rcu_user_exit();
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
@@ -4340,9 +3805,7 @@ recheck:
*/
if (unlikely(policy == p->policy && (!rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
-
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -4864,13 +4327,6 @@ again:
*/
if (preempt && rq != p_rq)
resched_task(p_rq->curr);
- } else {
- /*
- * We might have set it in task_yield_fair(), but are
- * not going to schedule(), so don't want to skip
- * the next update.
- */
- rq->skip_clock_update = 0;
}
out:
@@ -5300,27 +4756,17 @@ void idle_task_exit(void)
}
/*
- * While a dead CPU has no uninterruptible tasks queued at this point,
- * it might still have a nonzero ->nr_uninterruptible counter, because
- * for performance reasons the counter is not stricly tracking tasks to
- * their home CPUs. So we just add the counter to another CPU's counter,
- * to keep the global sum constant after CPU-down:
- */
-static void migrate_nr_uninterruptible(struct rq *rq_src)
-{
- struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-
- rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
- rq_src->nr_uninterruptible = 0;
-}
-
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
+ * Since this CPU is going 'away' for a while, fold any nr_active delta
+ * we might have. Assumes we're called after migrate_tasks() so that the
+ * nr_active count is stable.
+ *
+ * Also see the comment "Global load-average calculations".
*/
-static void calc_global_load_remove(struct rq *rq)
+static void calc_load_migrate(struct rq *rq)
{
- atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
- rq->calc_load_active = 0;
+ long delta = calc_load_fold_active(rq);
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
}
/*
@@ -5348,9 +4794,6 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
rq->stop = NULL;
- /* Ensure any throttled groups are reachable by pick_next_task */
- unthrottle_offline_cfs_rqs(rq);
-
for ( ; ; ) {
/*
* There's this thread running, bail when that's the only
@@ -5425,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
*tablep = NULL;
}
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX;
+
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- umode_t mode, proc_handler *proc_handler)
+ umode_t mode, proc_handler *proc_handler,
+ bool load_idx)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
+
+ if (load_idx) {
+ entry->extra1 = &min_load_idx;
+ entry->extra2 = &max_load_idx;
+ }
}
static struct ctl_table *
@@ -5446,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
- sizeof(long), 0644, proc_doulongvec_minmax);
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "name", sd->name,
- CORENAME_MAX_SIZE, 0444, proc_dostring);
+ CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[12] is terminator */
return table;
@@ -5613,9 +5065,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_tasks(cpu);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ break;
- migrate_nr_uninterruptible(rq);
- calc_global_load_remove(rq);
+ case CPU_DEAD:
+ calc_load_migrate(rq);
break;
#endif
}
@@ -6024,11 +5477,6 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
- * Iterate domains and sched_groups downward, assigning CPUs to be
- * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
- * due to random perturbation self canceling, ie sw buddies pull
- * their counterpart to their CPU's hw counterpart.
- *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -6042,40 +5490,8 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd) {
- struct sched_domain *tmp = sd;
- struct sched_group *sg, *prev;
- bool right;
-
- /*
- * Traverse to first CPU in group, and count hops
- * to cpu from there, switching direction on each
- * hop, never ever pointing the last CPU rightward.
- */
- do {
- id = cpumask_first(sched_domain_span(tmp));
- prev = sg = tmp->groups;
- right = 1;
-
- while (cpumask_first(sched_group_cpus(sg)) != id)
- sg = sg->next;
-
- while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
- prev = sg;
- sg = sg->next;
- right = !right;
- }
-
- /* A CPU went down, never point back to domain start. */
- if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
- right = false;
-
- sg = right ? sg->next : prev;
- tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
- } while ((tmp = tmp->child));
-
+ if (sd)
id = cpumask_first(sched_domain_span(sd));
- }
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
@@ -6584,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
| 0*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 0*SD_WAKE_AFFINE
- | 0*SD_PREFER_LOCAL
| 0*SD_SHARE_CPUPOWER
| 0*SD_SHARE_PKG_RESOURCES
| 1*SD_SERIALIZE
@@ -7248,6 +6663,7 @@ int in_sched_functions(unsigned long addr)
#ifdef CONFIG_CGROUP_SCHED
struct task_group root_task_group;
+LIST_HEAD(task_groups);
#endif
DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
@@ -8381,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* (balbir@in.ibm.com).
*/
+struct cpuacct root_cpuacct;
+
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp)
{
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index d72586fdf66..23aa789c53e 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -65,8 +65,8 @@ static int convert_prio(int prio)
int cpupri_find(struct cpupri *cp, struct task_struct *p,
struct cpumask *lowest_mask)
{
- int idx = 0;
- int task_pri = convert_prio(p->prio);
+ int idx = 0;
+ int task_pri = convert_prio(p->prio);
if (task_pri >= MAX_RT_PRIO)
return 0;
@@ -137,9 +137,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
*/
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
- int *currpri = &cp->cpu_to_pri[cpu];
- int oldpri = *currpri;
- int do_mb = 0;
+ int *currpri = &cp->cpu_to_pri[cpu];
+ int oldpri = *currpri;
+ int do_mb = 0;
newpri = convert_prio(newpri);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
new file mode 100644
index 00000000000..81b763ba58a
--- /dev/null
+++ b/kernel/sched/cputime.c
@@ -0,0 +1,530 @@
+#include <linux/export.h>
+#include <linux/sched.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kernel_stat.h>
+#include <linux/static_key.h>
+#include "sched.h"
+
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in vtime_account, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/vtime_account on this CPU. We would either get old
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
+ */
+DEFINE_PER_CPU(u64, cpu_hardirq_time);
+DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+#ifndef CONFIG_64BIT
+DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+#endif /* CONFIG_64BIT */
+
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
+void vtime_account(struct task_struct *curr)
+{
+ unsigned long flags;
+ s64 delta;
+ int cpu;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+ __this_cpu_add(irq_start_time, delta);
+
+ irq_time_write_begin();
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ __this_cpu_add(cpu_hardirq_time, delta);
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ __this_cpu_add(cpu_softirq_time, delta);
+
+ irq_time_write_end();
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(vtime_account);
+
+static int irqtime_account_hi_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#define sched_clock_irqtime (0)
+
+#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+#endif
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca && (ca != &root_cpuacct)) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += tmp;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+#endif
+}
+
+/*
+ * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in user space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_user_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ int index;
+
+ /* Add user time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+
+ index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
+ /* Add user time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for user time used */
+ acct_update_integrals(p);
+}
+
+/*
+ * Account guest cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in virtual machine since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+static void account_guest_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ /* Add guest time to process. */
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
+ account_group_user_time(p, cputime);
+ p->gtime += cputime;
+
+ /* Add guest time to cpustat. */
+ if (TASK_NICE(p) > 0) {
+ cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
+ } else {
+ cpustat[CPUTIME_USER] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64) cputime;
+ }
+}
+
+/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, int index)
+{
+ /* Add system time to process. */
+ p->stime += cputime;
+ p->stimescaled += cputime_scaled;
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ task_group_account_field(p, index, (__force u64) cputime);
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
+ * Account system cpu time to a process.
+ * @p: the process that the cpu time gets accounted to
+ * @hardirq_offset: the offset to subtract from hardirq_count()
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ */
+void account_system_time(struct task_struct *p, int hardirq_offset,
+ cputime_t cputime, cputime_t cputime_scaled)
+{
+ int index;
+
+ if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
+ account_guest_time(p, cputime, cputime_scaled);
+ return;
+ }
+
+ if (hardirq_count() - hardirq_offset)
+ index = CPUTIME_IRQ;
+ else if (in_serving_softirq())
+ index = CPUTIME_SOFTIRQ;
+ else
+ index = CPUTIME_SYSTEM;
+
+ __account_system_time(p, cputime, cputime_scaled, index);
+}
+
+/*
+ * Account for involuntary wait time.
+ * @cputime: the cpu time spent in involuntary wait
+ */
+void account_steal_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ cpustat[CPUTIME_STEAL] += (__force u64) cputime;
+}
+
+/*
+ * Account for idle time.
+ * @cputime: the cpu time spent in idle wait
+ */
+void account_idle_time(cputime_t cputime)
+{
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ struct rq *rq = this_rq();
+
+ if (atomic_read(&rq->nr_iowait) > 0)
+ cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
+ else
+ cpustat[CPUTIME_IDLE] += (__force u64) cputime;
+}
+
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+ if (static_key_false(&paravirt_steal_enabled)) {
+ u64 steal, st = 0;
+
+ steal = paravirt_steal_clock(smp_processor_id());
+ steal -= this_rq()->prev_steal_time;
+
+ st = steal_ticks(steal);
+ this_rq()->prev_steal_time += st * TICK_NSEC;
+
+ account_steal_time(st);
+ return st;
+ }
+#endif
+ return false;
+}
+
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+ if (steal_account_process_tick())
+ return;
+
+ if (irqtime_account_hi_update()) {
+ cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ } else if (irqtime_account_si_update()) {
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ CPUTIME_SOFTIRQ);
+ } else if (user_tick) {
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime_one_jiffy);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else {
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ CPUTIME_SYSTEM);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ int i;
+ struct rq *rq = this_rq();
+
+ for (i = 0; i < ticks; i++)
+ irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+/*
+ * Account a single tick of cpu time.
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: indicates if the tick is a user or a system tick
+ */
+void account_process_tick(struct task_struct *p, int user_tick)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ struct rq *rq = this_rq();
+
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
+ if (steal_account_process_tick())
+ return;
+
+ if (user_tick)
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
+ account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
+ one_jiffy_scaled);
+ else
+ account_idle_time(cputime_one_jiffy);
+}
+
+/*
+ * Account multiple ticks of steal time.
+ * @p: the process from which the cpu time has been stolen
+ * @ticks: number of stolen ticks
+ */
+void account_steal_ticks(unsigned long ticks)
+{
+ account_steal_time(jiffies_to_cputime(ticks));
+}
+
+/*
+ * Account multiple ticks of idle time.
+ * @ticks: number of stolen ticks
+ */
+void account_idle_ticks(unsigned long ticks)
+{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
+ account_idle_time(jiffies_to_cputime(ticks));
+}
+
+#endif
+
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_account(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (in_interrupt() || !is_idle_task(tsk))
+ vtime_account_system(tsk);
+ else
+ vtime_account_idle(tsk);
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(vtime_account);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+
+#else
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
+#endif
+
+static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+{
+ u64 temp = (__force u64) rtime;
+
+ temp *= (__force u64) utime;
+
+ if (sizeof(cputime_t) == 4)
+ temp = div_u64(temp, (__force u32) total);
+ else
+ temp = div64_u64(temp, (__force u64) total);
+
+ return (__force cputime_t) temp;
+}
+
+void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ cputime_t rtime, utime = p->utime, total = utime + p->stime;
+
+ /*
+ * Use CFS's precise accounting:
+ */
+ rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
+
+ if (total)
+ utime = scale_utime(utime, rtime, total);
+ else
+ utime = rtime;
+
+ /*
+ * Compare with previous values, to keep monotonicity:
+ */
+ p->prev_utime = max(p->prev_utime, utime);
+ p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
+
+ *ut = p->prev_utime;
+ *st = p->prev_stime;
+}
+
+/*
+ * Must be called with siglock held.
+ */
+void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct signal_struct *sig = p->signal;
+ struct task_cputime cputime;
+ cputime_t rtime, utime, total;
+
+ thread_group_cputime(p, &cputime);
+
+ total = cputime.utime + cputime.stime;
+ rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
+
+ if (total)
+ utime = scale_utime(cputime.utime, rtime, total);
+ else
+ utime = rtime;
+
+ sig->prev_utime = max(sig->prev_utime, utime);
+ sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
+
+ *ut = sig->prev_utime;
+ *st = sig->prev_stime;
+}
+#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22321db6495..6b800a14b99 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se)
/*
* The idea is to set a period in which each task runs once.
*
- * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
+ * When there are too many tasks (sched_nr_latency) we have to stretch
* this period because otherwise the slices get too small.
*
* p = (nr <= nl) ? l : l*nr/nl
@@ -2052,7 +2052,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
-void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
@@ -2106,7 +2106,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -2637,6 +2637,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
+ struct sched_group *sg;
+ int i;
/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2653,17 +2655,29 @@ static int select_idle_sibling(struct task_struct *p, int target)
return prev_cpu;
/*
- * Otherwise, check assigned siblings to find an elegible idle cpu.
+ * Otherwise, iterate the domains and find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
-
for_each_lower_domain(sd) {
- if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
- continue;
- if (idle_cpu(sd->idle_buddy))
- return sd->idle_buddy;
- }
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (!idle_cpu(i))
+ goto next;
+ }
+
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ goto done;
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
+ }
+done:
return target;
}
@@ -2686,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
- int want_sd = 1;
int sync = wake_flags & WF_SYNC;
if (p->nr_cpus_allowed == 1)
@@ -2704,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
continue;
/*
- * If power savings logic is enabled for a domain, see if we
- * are not overloaded, if so, don't balance wider.
- */
- if (tmp->flags & (SD_PREFER_LOCAL)) {
- unsigned long power = 0;
- unsigned long nr_running = 0;
- unsigned long capacity;
- int i;
-
- for_each_cpu(i, sched_domain_span(tmp)) {
- power += power_of(i);
- nr_running += cpu_rq(i)->cfs.nr_running;
- }
-
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
-
- if (nr_running < capacity)
- want_sd = 0;
- }
-
- /*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
- want_affine = 0;
- }
-
- if (!want_sd && !want_affine)
break;
+ }
- if (!(tmp->flags & sd_flag))
- continue;
-
- if (want_sd)
+ if (tmp->flags & sd_flag)
sd = tmp;
}
if (affine_sd) {
- if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
prev_cpu = cpu;
new_cpu = select_idle_sibling(p, prev_cpu);
@@ -3069,6 +3055,9 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ /* The set of CPUs under consideration for load-balancing */
+ struct cpumask *cpus;
+
unsigned int flags;
unsigned int loop;
@@ -3384,6 +3373,14 @@ static int tg_load_down(struct task_group *tg, void *data)
static void update_h_load(long cpu)
{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long now = jiffies;
+
+ if (rq->h_load_throttle == now)
+ return;
+
+ rq->h_load_throttle = now;
+
rcu_read_lock();
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
rcu_read_unlock();
@@ -3647,14 +3644,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* @group: sched_group whose statistics are to be updated.
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
- * @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, const struct cpumask *cpus,
- int *balance, struct sg_lb_stats *sgs)
+ int local_group, int *balance, struct sg_lb_stats *sgs)
{
unsigned long nr_running, max_nr_running, min_nr_running;
unsigned long load, max_cpu_load, min_cpu_load;
@@ -3671,7 +3666,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
max_nr_running = 0;
min_nr_running = ~0UL;
- for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+ for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
nr_running = rq->nr_running;
@@ -3795,13 +3790,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
- * @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct lb_env *env,
- const struct cpumask *cpus,
- int *balance, struct sd_lb_stats *sds)
+ int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
@@ -3818,8 +3811,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(env, sg, load_idx, local_group,
- cpus, balance, &sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
if (local_group && !(*balance))
return;
@@ -4055,7 +4047,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* to restore balance.
*
* @env: The load balancing environment.
- * @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
*
@@ -4065,7 +4056,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* put to idle by rebalancing its tasks onto our group.
*/
static struct sched_group *
-find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
+find_busiest_group(struct lb_env *env, int *balance)
{
struct sd_lb_stats sds;
@@ -4075,7 +4066,7 @@ find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(env, cpus, balance, &sds);
+ update_sd_lb_stats(env, balance, &sds);
/*
* this_cpu is not the appropriate cpu to perform load balancing at
@@ -4155,8 +4146,7 @@ ret:
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
static struct rq *find_busiest_queue(struct lb_env *env,
- struct sched_group *group,
- const struct cpumask *cpus)
+ struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
@@ -4171,7 +4161,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
- if (!cpumask_test_cpu(i, cpus))
+ if (!cpumask_test_cpu(i, env->cpus))
continue;
rq = cpu_rq(i);
@@ -4252,6 +4242,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
+ .cpus = cpus,
};
cpumask_copy(cpus, cpu_active_mask);
@@ -4260,7 +4251,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(&env, cpus, balance);
+ group = find_busiest_group(&env, balance);
if (*balance == 0)
goto out_balanced;
@@ -4270,13 +4261,13 @@ redo:
goto out_balanced;
}
- busiest = find_busiest_queue(&env, group, cpus);
+ busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
- BUG_ON(busiest == this_rq);
+ BUG_ON(busiest == env.dst_rq);
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
@@ -4294,11 +4285,10 @@ redo:
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
+ update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
- double_rq_lock(this_rq, busiest);
- if (!env.loop)
- update_h_load(env.src_cpu);
+ double_rq_lock(env.dst_rq, busiest);
/*
* cur_ld_moved - load moved in current iteration
@@ -4306,7 +4296,7 @@ more_balance:
*/
cur_ld_moved = move_tasks(&env);
ld_moved += cur_ld_moved;
- double_rq_unlock(this_rq, busiest);
+ double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
if (env.flags & LBF_NEED_BREAK) {
@@ -4342,8 +4332,7 @@ more_balance:
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
lb_iterations++ < max_lb_iterations) {
- this_rq = cpu_rq(env.new_dst_cpu);
- env.dst_rq = this_rq;
+ env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
@@ -4628,7 +4617,7 @@ static void nohz_balancer_kick(int cpu)
return;
}
-static inline void clear_nohz_tick_stopped(int cpu)
+static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
@@ -4668,28 +4657,23 @@ void set_cpu_sd_state_idle(void)
}
/*
- * This routine will record that this cpu is going idle with tick stopped.
+ * This routine will record that the cpu is going idle with tick stopped.
* This info will be used in performing idle load balancing in the future.
*/
-void select_nohz_load_balancer(int stop_tick)
+void nohz_balance_enter_idle(int cpu)
{
- int cpu = smp_processor_id();
-
/*
* If this cpu is going down, then nothing needs to be done.
*/
if (!cpu_active(cpu))
return;
- if (stop_tick) {
- if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
- return;
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
+ return;
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
- atomic_inc(&nohz.nr_cpus);
- set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
- }
- return;
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
@@ -4697,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
{
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DYING:
- clear_nohz_tick_stopped(smp_processor_id());
+ nohz_balance_exit_idle(smp_processor_id());
return NOTIFY_OK;
default:
return NOTIFY_DONE;
@@ -4819,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
if (need_resched())
break;
- raw_spin_lock_irq(&this_rq->lock);
- update_rq_clock(this_rq);
- update_idle_cpu_load(this_rq);
- raw_spin_unlock_irq(&this_rq->lock);
+ rq = cpu_rq(balance_cpu);
+
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
rebalance_domains(balance_cpu, CPU_IDLE);
- rq = cpu_rq(balance_cpu);
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
}
@@ -4857,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
- clear_nohz_tick_stopped(cpu);
+ nohz_balance_exit_idle(cpu);
/*
* None are in tickless mode and hence no need for NOHZ idle load
@@ -4950,6 +4935,9 @@ static void rq_online_fair(struct rq *rq)
static void rq_offline_fair(struct rq *rq)
{
update_sysctl();
+
+ /* Ensure any throttled groups are reachable by pick_next_task */
+ unthrottle_offline_cfs_rqs(rq);
}
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index de00a486c5c..eebefcad702 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
SCHED_FEAT(START_DEBIT, true)
/*
- * Based on load and program behaviour, see if it makes sense to place
- * a newly woken task on the same cpu as the task that woke it --
- * improve cache locality. Typically used with SYNC wakeups as
- * generated by pipes and the like, see also SYNC_WAKEUPS.
- */
-SCHED_FEAT(AFFINE_WAKEUPS, true)
-
-/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
@@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Use arch dependent cpu power functions
*/
-SCHED_FEAT(ARCH_POWER, false)
+SCHED_FEAT(ARCH_POWER, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 573e1ca0110..418feb01344 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -691,6 +691,7 @@ balanced:
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -788,6 +789,19 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
const struct cpumask *span;
span = sched_rt_period_mask();
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * FIXME: isolated CPUs should really leave the root task group,
+ * whether they are isolcpus or were isolated via cpusets, lest
+ * the timer run on a CPU which does not service all runqueues,
+ * potentially leaving other CPUs indefinitely throttled. If
+ * isolation is really required, the user will turn the throttle
+ * off to kill the perturbations it causes anyway. Meanwhile,
+ * this maintains functionality for boot and/or troubleshooting.
+ */
+ if (rt_b == &root_task_group.rt_bandwidth)
+ span = cpu_online_mask;
+#endif
for_each_cpu(i, span) {
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
@@ -1618,11 +1632,6 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- if (unlikely(task_running(rq, next_task)))
- return 0;
-#endif
-
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c35a1a7dd4d..7a7db09cfab 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -80,7 +80,7 @@ extern struct mutex sched_domains_mutex;
struct cfs_rq;
struct rt_rq;
-static LIST_HEAD(task_groups);
+extern struct list_head task_groups;
struct cfs_bandwidth {
#ifdef CONFIG_CFS_BANDWIDTH
@@ -374,7 +374,11 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif
+#ifdef CONFIG_SMP
+ unsigned long h_load_throttle;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
#ifdef CONFIG_RT_GROUP_SCHED
struct list_head leaf_rt_rq_list;
#endif
@@ -733,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
*/
next->on_cpu = 1;
#endif
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- raw_spin_unlock_irq(&rq->lock);
-#else
raw_spin_unlock(&rq->lock);
-#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
@@ -751,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
smp_wmb();
prev->on_cpu = 0;
#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
-#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -887,6 +885,9 @@ struct cpuacct {
struct kernel_cpustat __percpu *cpustat;
};
+extern struct cgroup_subsys cpuacct_subsys;
+extern struct cpuacct root_cpuacct;
+
/* return cpu accounting group corresponding to this container */
static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
{
@@ -913,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+ if (unlikely(steal > NSEC_PER_SEC))
+ return div_u64(steal, TICK_NSEC);
+
+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
static inline void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
@@ -1140,7 +1151,6 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void unthrottle_offline_cfs_rqs(struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
@@ -1153,3 +1163,53 @@ enum rq_nohz_flag_bits {
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+DECLARE_PER_CPU(u64, cpu_hardirq_time);
+DECLARE_PER_CPU(u64, cpu_softirq_time);
+
+#ifndef CONFIG_64BIT
+DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+ __this_cpu_inc(irq_time_seq.sequence);
+ smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+ smp_wmb();
+ __this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ u64 irq_time;
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+ irq_time = per_cpu(cpu_softirq_time, cpu) +
+ per_cpu(cpu_hardirq_time, cpu);
+ } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+ return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e86fd2..da5eb5bed84 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -27,8 +27,10 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ stop->se.exec_start = rq->clock_task;
return stop;
+ }
return NULL;
}
@@ -52,6 +54,21 @@ static void yield_task_stop(struct rq *rq)
static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
{
+ struct task_struct *curr = rq->curr;
+ u64 delta_exec;
+
+ delta_exec = rq->clock_task - curr->se.exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ schedstat_set(curr->se.statistics.exec_max,
+ max(curr->se.statistics.exec_max, delta_exec));
+
+ curr->se.sum_exec_runtime += delta_exec;
+ account_group_exec_runtime(curr, delta_exec);
+
+ curr->se.exec_start = rq->clock_task;
+ cpuacct_charge(curr, delta_exec);
}
static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
@@ -60,6 +77,9 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
static void set_curr_task_stop(struct rq *rq)
{
+ struct task_struct *stop = rq->stop;
+
+ stop->se.exec_start = rq->clock_task;
}
static void switched_to_stop(struct rq *rq, struct task_struct *p)
diff --git a/kernel/signal.c b/kernel/signal.c
index be4f856d52f..2c681f11b7d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1971,13 +1971,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why)
void ptrace_notify(int exit_code)
{
BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
- if (unlikely(current->task_works)) {
- if (test_and_clear_ti_thread_flag(current_thread_info(),
- TIF_NOTIFY_RESUME)) {
- smp_mb__after_clear_bit();
- task_work_run();
- }
- }
+ if (unlikely(current->task_works))
+ task_work_run();
spin_lock_irq(&current->sighand->siglock);
ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED);
@@ -2198,13 +2193,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
struct signal_struct *signal = current->signal;
int signr;
- if (unlikely(current->task_works)) {
- if (test_and_clear_ti_thread_flag(current_thread_info(),
- TIF_NOTIFY_RESUME)) {
- smp_mb__after_clear_bit();
- task_work_run();
- }
- }
+ if (unlikely(current->task_works))
+ task_work_run();
if (unlikely(uprobe_deny_signal()))
return 0;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 98f60c5caa1..d6c5fc05424 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -1,14 +1,22 @@
/*
* Common SMP CPU bringup/teardown functions
*/
+#include <linux/cpu.h>
#include <linux/err.h>
#include <linux/smp.h>
#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/export.h>
#include <linux/percpu.h>
+#include <linux/kthread.h>
+#include <linux/smpboot.h>
#include "smpboot.h"
+#ifdef CONFIG_SMP
+
#ifdef CONFIG_GENERIC_SMP_IDLE_THREAD
/*
* For the hotplug case we keep the task structs around and reuse
@@ -65,3 +73,228 @@ void __init idle_threads_init(void)
}
}
#endif
+
+#endif /* #ifdef CONFIG_SMP */
+
+static LIST_HEAD(hotplug_threads);
+static DEFINE_MUTEX(smpboot_threads_lock);
+
+struct smpboot_thread_data {
+ unsigned int cpu;
+ unsigned int status;
+ struct smp_hotplug_thread *ht;
+};
+
+enum {
+ HP_THREAD_NONE = 0,
+ HP_THREAD_ACTIVE,
+ HP_THREAD_PARKED,
+};
+
+/**
+ * smpboot_thread_fn - percpu hotplug thread loop function
+ * @data: thread data pointer
+ *
+ * Checks for thread stop and park conditions. Calls the necessary
+ * setup, cleanup, park and unpark functions for the registered
+ * thread.
+ *
+ * Returns 1 when the thread should exit, 0 otherwise.
+ */
+static int smpboot_thread_fn(void *data)
+{
+ struct smpboot_thread_data *td = data;
+ struct smp_hotplug_thread *ht = td->ht;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ preempt_disable();
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->cleanup)
+ ht->cleanup(td->cpu, cpu_online(td->cpu));
+ kfree(td);
+ return 0;
+ }
+
+ if (kthread_should_park()) {
+ __set_current_state(TASK_RUNNING);
+ preempt_enable();
+ if (ht->park && td->status == HP_THREAD_ACTIVE) {
+ BUG_ON(td->cpu != smp_processor_id());
+ ht->park(td->cpu);
+ td->status = HP_THREAD_PARKED;
+ }
+ kthread_parkme();
+ /* We might have been woken for stop */
+ continue;
+ }
+
+ BUG_ON(td->cpu != smp_processor_id());
+
+ /* Check for state change setup */
+ switch (td->status) {
+ case HP_THREAD_NONE:
+ preempt_enable();
+ if (ht->setup)
+ ht->setup(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ case HP_THREAD_PARKED:
+ preempt_enable();
+ if (ht->unpark)
+ ht->unpark(td->cpu);
+ td->status = HP_THREAD_ACTIVE;
+ preempt_disable();
+ break;
+ }
+
+ if (!ht->thread_should_run(td->cpu)) {
+ preempt_enable();
+ schedule();
+ } else {
+ set_current_state(TASK_RUNNING);
+ preempt_enable();
+ ht->thread_fn(td->cpu);
+ }
+ }
+}
+
+static int
+__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ struct smpboot_thread_data *td;
+
+ if (tsk)
+ return 0;
+
+ td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
+ if (!td)
+ return -ENOMEM;
+ td->cpu = cpu;
+ td->ht = ht;
+
+ tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
+ ht->thread_comm);
+ if (IS_ERR(tsk)) {
+ kfree(td);
+ return PTR_ERR(tsk);
+ }
+
+ get_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = tsk;
+ return 0;
+}
+
+int smpboot_create_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list) {
+ ret = __smpboot_create_thread(cur, cpu);
+ if (ret)
+ break;
+ }
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+
+static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ kthread_unpark(tsk);
+}
+
+void smpboot_unpark_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry(cur, &hotplug_threads, list)
+ smpboot_unpark_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
+{
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk)
+ kthread_park(tsk);
+}
+
+void smpboot_park_threads(unsigned int cpu)
+{
+ struct smp_hotplug_thread *cur;
+
+ mutex_lock(&smpboot_threads_lock);
+ list_for_each_entry_reverse(cur, &hotplug_threads, list)
+ smpboot_park_thread(cur, cpu);
+ mutex_unlock(&smpboot_threads_lock);
+}
+
+static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
+{
+ unsigned int cpu;
+
+ /* We need to destroy also the parked threads of offline cpus */
+ for_each_possible_cpu(cpu) {
+ struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+
+ if (tsk) {
+ kthread_stop(tsk);
+ put_task_struct(tsk);
+ *per_cpu_ptr(ht->store, cpu) = NULL;
+ }
+ }
+}
+
+/**
+ * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Creates and starts the threads on all online cpus.
+ */
+int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ unsigned int cpu;
+ int ret = 0;
+
+ mutex_lock(&smpboot_threads_lock);
+ for_each_online_cpu(cpu) {
+ ret = __smpboot_create_thread(plug_thread, cpu);
+ if (ret) {
+ smpboot_destroy_threads(plug_thread);
+ goto out;
+ }
+ smpboot_unpark_thread(plug_thread, cpu);
+ }
+ list_add(&plug_thread->list, &hotplug_threads);
+out:
+ mutex_unlock(&smpboot_threads_lock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+
+/**
+ * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
+ * @plug_thread: Hotplug thread descriptor
+ *
+ * Stops all threads on all possible cpus.
+ */
+void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
+{
+ get_online_cpus();
+ mutex_lock(&smpboot_threads_lock);
+ list_del(&plug_thread->list);
+ smpboot_destroy_threads(plug_thread);
+ mutex_unlock(&smpboot_threads_lock);
+ put_online_cpus();
+}
+EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
index 6ef9433e1c7..72415a0eb95 100644
--- a/kernel/smpboot.h
+++ b/kernel/smpboot.h
@@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { }
static inline void idle_threads_init(void) { }
#endif
+int smpboot_create_threads(unsigned int cpu);
+void smpboot_park_threads(unsigned int cpu);
+void smpboot_unpark_threads(unsigned int cpu);
+
#endif
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b73e681df09..cc96bdc0c2c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/smpboot.h>
#include <linux/tick.h>
#define CREATE_TRACE_POINTS
@@ -220,7 +221,7 @@ asmlinkage void __do_softirq(void)
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
- account_system_vtime(current);
+ vtime_account(current);
__local_bh_disable((unsigned long)__builtin_return_address(0),
SOFTIRQ_OFFSET);
@@ -271,7 +272,7 @@ restart:
lockdep_softirq_exit();
- account_system_vtime(current);
+ vtime_account(current);
__local_bh_enable(SOFTIRQ_OFFSET);
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
@@ -340,7 +341,7 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
- account_system_vtime(current);
+ vtime_account(current);
trace_hardirq_exit();
sub_preempt_count(IRQ_EXIT_OFFSET);
if (!in_interrupt() && local_softirq_pending())
@@ -742,49 +743,22 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
-static int run_ksoftirqd(void * __bind_cpu)
+static int ksoftirqd_should_run(unsigned int cpu)
{
- set_current_state(TASK_INTERRUPTIBLE);
-
- while (!kthread_should_stop()) {
- preempt_disable();
- if (!local_softirq_pending()) {
- schedule_preempt_disabled();
- }
-
- __set_current_state(TASK_RUNNING);
-
- while (local_softirq_pending()) {
- /* Preempt disable stops cpu going offline.
- If already offline, we'll be on wrong CPU:
- don't process */
- if (cpu_is_offline((long)__bind_cpu))
- goto wait_to_die;
- local_irq_disable();
- if (local_softirq_pending())
- __do_softirq();
- local_irq_enable();
- sched_preempt_enable_no_resched();
- cond_resched();
- preempt_disable();
- rcu_note_context_switch((long)__bind_cpu);
- }
- preempt_enable();
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
- return 0;
+ return local_softirq_pending();
+}
-wait_to_die:
- preempt_enable();
- /* Wait for kthread_stop */
- set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop()) {
- schedule();
- set_current_state(TASK_INTERRUPTIBLE);
+static void run_ksoftirqd(unsigned int cpu)
+{
+ local_irq_disable();
+ if (local_softirq_pending()) {
+ __do_softirq();
+ rcu_note_context_switch(cpu);
+ local_irq_enable();
+ cond_resched();
+ return;
}
- __set_current_state(TASK_RUNNING);
- return 0;
+ local_irq_enable();
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- int hotcpu = (unsigned long)hcpu;
- struct task_struct *p;
-
switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- p = kthread_create_on_node(run_ksoftirqd,
- hcpu,
- cpu_to_node(hotcpu),
- "ksoftirqd/%d", hotcpu);
- if (IS_ERR(p)) {
- printk("ksoftirqd for %i failed\n", hotcpu);
- return notifier_from_errno(PTR_ERR(p));
- }
- kthread_bind(p, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = p;
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- wake_up_process(per_cpu(ksoftirqd, hotcpu));
- break;
#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- if (!per_cpu(ksoftirqd, hotcpu))
- break;
- /* Unbind so it can run. Fall thru. */
- kthread_bind(per_cpu(ksoftirqd, hotcpu),
- cpumask_any(cpu_online_mask));
case CPU_DEAD:
- case CPU_DEAD_FROZEN: {
- static const struct sched_param param = {
- .sched_priority = MAX_RT_PRIO-1
- };
-
- p = per_cpu(ksoftirqd, hotcpu);
- per_cpu(ksoftirqd, hotcpu) = NULL;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
- kthread_stop(p);
- takeover_tasklets(hotcpu);
+ case CPU_DEAD_FROZEN:
+ takeover_tasklets((unsigned long)hcpu);
break;
- }
#endif /* CONFIG_HOTPLUG_CPU */
- }
+ }
return NOTIFY_OK;
}
@@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
.notifier_call = cpu_callback
};
+static struct smp_hotplug_thread softirq_threads = {
+ .store = &ksoftirqd,
+ .thread_should_run = ksoftirqd_should_run,
+ .thread_fn = run_ksoftirqd,
+ .thread_comm = "ksoftirqd/%u",
+};
+
static __init int spawn_ksoftirqd(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
-
- BUG_ON(err != NOTIFY_OK);
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
+
+ BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
return 0;
}
early_initcall(spawn_ksoftirqd);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 87174ef5916..84c76a34e41 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -307,7 +307,7 @@ static struct ctl_table kern_table[] = {
.extra2 = &max_sched_tunable_scaling,
},
{
- .procname = "sched_migration_cost",
+ .procname = "sched_migration_cost_ns",
.data = &sysctl_sched_migration_cost,
.maxlen = sizeof(unsigned int),
.mode = 0644,
@@ -321,14 +321,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_time_avg",
+ .procname = "sched_time_avg_ms",
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
- .procname = "sched_shares_window",
+ .procname = "sched_shares_window_ns",
.data = &sysctl_sched_shares_window,
.maxlen = sizeof(unsigned int),
.mode = 0644,
@@ -1544,7 +1544,7 @@ static struct ctl_table fs_table[] = {
static struct ctl_table debug_table[] = {
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390) || defined(CONFIG_TILE)
+ defined(CONFIG_S390) || defined(CONFIG_TILE) || defined(CONFIG_ARM64)
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 91d4e1742a0..65bd3c92d6f 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -2,26 +2,20 @@
#include <linux/task_work.h>
#include <linux/tracehook.h>
+static struct callback_head work_exited; /* all we need is ->next == NULL */
+
int
-task_work_add(struct task_struct *task, struct callback_head *twork, bool notify)
+task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
{
- struct callback_head *last, *first;
- unsigned long flags;
+ struct callback_head *head;
- /*
- * Not inserting the new work if the task has already passed
- * exit_task_work() is the responisbility of callers.
- */
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- last = task->task_works;
- first = last ? last->next : twork;
- twork->next = first;
- if (last)
- last->next = twork;
- task->task_works = twork;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ do {
+ head = ACCESS_ONCE(task->task_works);
+ if (unlikely(head == &work_exited))
+ return -ESRCH;
+ work->next = head;
+ } while (cmpxchg(&task->task_works, head, work) != head);
- /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */
if (notify)
set_notify_resume(task);
return 0;
@@ -30,51 +24,69 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify
struct callback_head *
task_work_cancel(struct task_struct *task, task_work_func_t func)
{
+ struct callback_head **pprev = &task->task_works;
+ struct callback_head *work = NULL;
unsigned long flags;
- struct callback_head *last, *res = NULL;
-
+ /*
+ * If cmpxchg() fails we continue without updating pprev.
+ * Either we raced with task_work_add() which added the
+ * new entry before this work, we will find it again. Or
+ * we raced with task_work_run(), *pprev == NULL/exited.
+ */
raw_spin_lock_irqsave(&task->pi_lock, flags);
- last = task->task_works;
- if (last) {
- struct callback_head *q = last, *p = q->next;
- while (1) {
- if (p->func == func) {
- q->next = p->next;
- if (p == last)
- task->task_works = q == p ? NULL : q;
- res = p;
- break;
- }
- if (p == last)
- break;
- q = p;
- p = q->next;
- }
+ while ((work = ACCESS_ONCE(*pprev))) {
+ read_barrier_depends();
+ if (work->func != func)
+ pprev = &work->next;
+ else if (cmpxchg(pprev, work, work->next) == work)
+ break;
}
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- return res;
+
+ return work;
}
void task_work_run(void)
{
struct task_struct *task = current;
- struct callback_head *p, *q;
+ struct callback_head *work, *head, *next;
+
+ for (;;) {
+ /*
+ * work->func() can do task_work_add(), do not set
+ * work_exited unless the list is empty.
+ */
+ do {
+ work = ACCESS_ONCE(task->task_works);
+ head = !work && (task->flags & PF_EXITING) ?
+ &work_exited : NULL;
+ } while (cmpxchg(&task->task_works, work, head) != work);
- while (1) {
- raw_spin_lock_irq(&task->pi_lock);
- p = task->task_works;
- task->task_works = NULL;
- raw_spin_unlock_irq(&task->pi_lock);
+ if (!work)
+ break;
+ /*
+ * Synchronize with task_work_cancel(). It can't remove
+ * the first entry == work, cmpxchg(task_works) should
+ * fail, but it can play with *work and other entries.
+ */
+ raw_spin_unlock_wait(&task->pi_lock);
+ smp_mb();
- if (unlikely(!p))
- return;
+ /* Reverse the list to run the works in fifo order */
+ head = NULL;
+ do {
+ next = work->next;
+ work->next = head;
+ head = work;
+ work = next;
+ } while (work);
- q = p->next; /* head */
- p->next = NULL; /* cut it */
- while (q) {
- p = q->next;
- q->func(q);
- q = p;
- }
+ work = head;
+ do {
+ next = work->next;
+ work->func(work);
+ work = next;
+ cond_resched();
+ } while (work);
}
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index a470154e040..46da0537c10 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -37,7 +37,7 @@
* requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
-#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/SHIFTED_HZ))
/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
* conversion, the .shift value could be zero. However
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index b7fbadc5c97..24174b4d669 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -28,7 +28,7 @@ DEFINE_SPINLOCK(ntp_lock);
/* USER_HZ period (usecs): */
unsigned long tick_usec = TICK_USEC;
-/* ACTHZ period (nsecs): */
+/* SHIFTED_HZ period (nsecs): */
unsigned long tick_nsec;
static u64 tick_length;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 024540f97f7..f423bdd035c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
- select_nohz_load_balancer(1);
+ nohz_balance_enter_idle(cpu);
calc_load_enter_idle();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -436,7 +436,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
static int ratelimit;
- if (ratelimit < 10) {
+ if (ratelimit < 10 &&
+ (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
(unsigned int) local_softirq_pending());
ratelimit++;
@@ -569,10 +570,10 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
{
/* Update jiffies first */
- select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
update_cpu_load_nohz();
+ calc_load_exit_idle();
touch_softlockup_watchdog();
/*
* Cancel the scheduled timer and restore the tick
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f045cc50832..d3b91e75cec 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -65,14 +65,14 @@ struct timekeeper {
* used instead.
*/
struct timespec wall_to_monotonic;
- /* time spent in suspend */
- struct timespec total_sleep_time;
- /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
- struct timespec raw_time;
/* Offset clock monotonic -> clock realtime */
ktime_t offs_real;
+ /* time spent in suspend */
+ struct timespec total_sleep_time;
/* Offset clock monotonic -> clock boottime */
ktime_t offs_boot;
+ /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */
+ struct timespec raw_time;
/* Seqlock for all timekeeper values */
seqlock_t lock;
};
@@ -108,13 +108,39 @@ static struct timespec tk_xtime(struct timekeeper *tk)
static void tk_set_xtime(struct timekeeper *tk, const struct timespec *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->xtime_nsec = ts->tv_nsec << tk->shift;
+ tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->xtime_nsec += ts->tv_nsec << tk->shift;
+ tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+ tk_normalize_xtime(tk);
+}
+
+static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
+{
+ struct timespec tmp;
+
+ /*
+ * Verify consistency of: offset_real = -wall_to_monotonic
+ * before modifying anything
+ */
+ set_normalized_timespec(&tmp, -tk->wall_to_monotonic.tv_sec,
+ -tk->wall_to_monotonic.tv_nsec);
+ WARN_ON_ONCE(tk->offs_real.tv64 != timespec_to_ktime(tmp).tv64);
+ tk->wall_to_monotonic = wtm;
+ set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
+ tk->offs_real = timespec_to_ktime(tmp);
+}
+
+static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
+{
+ /* Verify consistency before modifying */
+ WARN_ON_ONCE(tk->offs_boot.tv64 != timespec_to_ktime(tk->total_sleep_time).tv64);
+
+ tk->total_sleep_time = t;
+ tk->offs_boot = timespec_to_ktime(t);
}
/**
@@ -217,14 +243,6 @@ static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
return nsec + arch_gettimeoffset();
}
-static void update_rt_offset(struct timekeeper *tk)
-{
- struct timespec tmp, *wtm = &tk->wall_to_monotonic;
-
- set_normalized_timespec(&tmp, -wtm->tv_sec, -wtm->tv_nsec);
- tk->offs_real = timespec_to_ktime(tmp);
-}
-
/* must hold write on timekeeper.lock */
static void timekeeping_update(struct timekeeper *tk, bool clearntp)
{
@@ -234,12 +252,10 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
tk->ntp_error = 0;
ntp_clear();
}
- update_rt_offset(tk);
xt = tk_xtime(tk);
update_vsyscall(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
}
-
/**
* timekeeping_forward_now - update clock to the current time
*
@@ -261,7 +277,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
tk->xtime_nsec += cycle_delta * tk->mult;
/* If arch requires, add in gettimeoffset() */
- tk->xtime_nsec += arch_gettimeoffset() << tk->shift;
+ tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
tk_normalize_xtime(tk);
@@ -277,38 +293,39 @@ static void timekeeping_forward_now(struct timekeeper *tk)
*/
void getnstimeofday(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs = 0;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- ts->tv_sec = timekeeper.xtime_sec;
- ts->tv_nsec = timekeeping_get_ns(&timekeeper);
+ ts->tv_sec = tk->xtime_sec;
+ nsecs = timekeeping_get_ns(tk);
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
+ ts->tv_nsec = 0;
timespec_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getnstimeofday);
ktime_t ktime_get(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned int seq;
s64 secs, nsecs;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&timekeeper.lock);
- secs = timekeeper.xtime_sec +
- timekeeper.wall_to_monotonic.tv_sec;
- nsecs = timekeeping_get_ns(&timekeeper) +
- timekeeper.wall_to_monotonic.tv_nsec;
+ seq = read_seqbegin(&tk->lock);
+ secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
+ nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
/*
* Use ktime_set/ktime_add_ns to create a proper ktime on
* 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -327,21 +344,24 @@ EXPORT_SYMBOL_GPL(ktime_get);
*/
void ktime_get_ts(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec tomono;
+ s64 nsec;
unsigned int seq;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&timekeeper.lock);
- ts->tv_sec = timekeeper.xtime_sec;
- ts->tv_nsec = timekeeping_get_ns(&timekeeper);
- tomono = timekeeper.wall_to_monotonic;
+ seq = read_seqbegin(&tk->lock);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(tk);
+ tomono = tk->wall_to_monotonic;
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
- ts->tv_nsec + tomono.tv_nsec);
+ ts->tv_sec += tomono.tv_sec;
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsec + tomono.tv_nsec);
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
@@ -358,22 +378,23 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
*/
void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs_raw, nsecs_real;
WARN_ON_ONCE(timekeeping_suspended);
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- *ts_raw = timekeeper.raw_time;
- ts_real->tv_sec = timekeeper.xtime_sec;
+ *ts_raw = tk->raw_time;
+ ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
- nsecs_raw = timekeeping_get_ns_raw(&timekeeper);
- nsecs_real = timekeeping_get_ns(&timekeeper);
+ nsecs_raw = timekeeping_get_ns_raw(tk);
+ nsecs_real = timekeeping_get_ns(tk);
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
timespec_add_ns(ts_raw, nsecs_raw);
timespec_add_ns(ts_real, nsecs_real);
@@ -406,28 +427,28 @@ EXPORT_SYMBOL(do_gettimeofday);
*/
int do_settimeofday(const struct timespec *tv)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec ts_delta, xt;
unsigned long flags;
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ if (!timespec_valid_strict(tv))
return -EINVAL;
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
- timekeeping_forward_now(&timekeeper);
+ timekeeping_forward_now(tk);
- xt = tk_xtime(&timekeeper);
+ xt = tk_xtime(tk);
ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
- timekeeper.wall_to_monotonic =
- timespec_sub(timekeeper.wall_to_monotonic, ts_delta);
+ tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts_delta));
- tk_set_xtime(&timekeeper, tv);
+ tk_set_xtime(tk, tv);
- timekeeping_update(&timekeeper, true);
+ timekeeping_update(tk, true);
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ write_sequnlock_irqrestore(&tk->lock, flags);
/* signal hrtimers about time change */
clock_was_set();
@@ -436,7 +457,6 @@ int do_settimeofday(const struct timespec *tv)
}
EXPORT_SYMBOL(do_settimeofday);
-
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
* @tv: pointer to the timespec variable containing the offset
@@ -445,28 +465,37 @@ EXPORT_SYMBOL(do_settimeofday);
*/
int timekeeping_inject_offset(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
+ struct timespec tmp;
+ int ret = 0;
if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
- timekeeping_forward_now(&timekeeper);
+ timekeeping_forward_now(tk);
+ /* Make sure the proposed value is valid */
+ tmp = timespec_add(tk_xtime(tk), *ts);
+ if (!timespec_valid_strict(&tmp)) {
+ ret = -EINVAL;
+ goto error;
+ }
- tk_xtime_add(&timekeeper, ts);
- timekeeper.wall_to_monotonic =
- timespec_sub(timekeeper.wall_to_monotonic, *ts);
+ tk_xtime_add(tk, ts);
+ tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
- timekeeping_update(&timekeeper, true);
+error: /* even if we error out, we forwarded the time, so call update */
+ timekeeping_update(tk, true);
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ write_sequnlock_irqrestore(&tk->lock, flags);
/* signal hrtimers about time change */
clock_was_set();
- return 0;
+ return ret;
}
EXPORT_SYMBOL(timekeeping_inject_offset);
@@ -477,23 +506,24 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
*/
static int change_clocksource(void *data)
{
+ struct timekeeper *tk = &timekeeper;
struct clocksource *new, *old;
unsigned long flags;
new = (struct clocksource *) data;
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
- timekeeping_forward_now(&timekeeper);
+ timekeeping_forward_now(tk);
if (!new->enable || new->enable(new) == 0) {
- old = timekeeper.clock;
- tk_setup_internals(&timekeeper, new);
+ old = tk->clock;
+ tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
}
- timekeeping_update(&timekeeper, true);
+ timekeeping_update(tk, true);
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ write_sequnlock_irqrestore(&tk->lock, flags);
return 0;
}
@@ -507,7 +537,9 @@ static int change_clocksource(void *data)
*/
void timekeeping_notify(struct clocksource *clock)
{
- if (timekeeper.clock == clock)
+ struct timekeeper *tk = &timekeeper;
+
+ if (tk->clock == clock)
return;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
@@ -536,35 +568,36 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
*/
void getrawmonotonic(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
s64 nsecs;
do {
- seq = read_seqbegin(&timekeeper.lock);
- nsecs = timekeeping_get_ns_raw(&timekeeper);
- *ts = timekeeper.raw_time;
+ seq = read_seqbegin(&tk->lock);
+ nsecs = timekeeping_get_ns_raw(tk);
+ *ts = tk->raw_time;
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
timespec_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getrawmonotonic);
-
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
*/
int timekeeping_valid_for_hres(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
int ret;
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
return ret;
}
@@ -574,15 +607,16 @@ int timekeeping_valid_for_hres(void)
*/
u64 timekeeping_max_deferment(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
u64 ret;
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- ret = timekeeper.clock->max_idle_ns;
+ ret = tk->clock->max_idle_ns;
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
return ret;
}
@@ -622,46 +656,56 @@ void __attribute__((weak)) read_boot_clock(struct timespec *ts)
*/
void __init timekeeping_init(void)
{
+ struct timekeeper *tk = &timekeeper;
struct clocksource *clock;
unsigned long flags;
- struct timespec now, boot;
+ struct timespec now, boot, tmp;
read_persistent_clock(&now);
+ if (!timespec_valid_strict(&now)) {
+ pr_warn("WARNING: Persistent clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ now.tv_sec = 0;
+ now.tv_nsec = 0;
+ }
+
read_boot_clock(&boot);
+ if (!timespec_valid_strict(&boot)) {
+ pr_warn("WARNING: Boot clock returned invalid value!\n"
+ " Check your CMOS/BIOS settings.\n");
+ boot.tv_sec = 0;
+ boot.tv_nsec = 0;
+ }
- seqlock_init(&timekeeper.lock);
+ seqlock_init(&tk->lock);
ntp_init();
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
- tk_setup_internals(&timekeeper, clock);
+ tk_setup_internals(tk, clock);
- tk_set_xtime(&timekeeper, &now);
- timekeeper.raw_time.tv_sec = 0;
- timekeeper.raw_time.tv_nsec = 0;
+ tk_set_xtime(tk, &now);
+ tk->raw_time.tv_sec = 0;
+ tk->raw_time.tv_nsec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
- boot = tk_xtime(&timekeeper);
-
- set_normalized_timespec(&timekeeper.wall_to_monotonic,
- -boot.tv_sec, -boot.tv_nsec);
- update_rt_offset(&timekeeper);
- timekeeper.total_sleep_time.tv_sec = 0;
- timekeeper.total_sleep_time.tv_nsec = 0;
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ boot = tk_xtime(tk);
+
+ set_normalized_timespec(&tmp, -boot.tv_sec, -boot.tv_nsec);
+ tk_set_wall_to_mono(tk, tmp);
+
+ tmp.tv_sec = 0;
+ tmp.tv_nsec = 0;
+ tk_set_sleep_time(tk, tmp);
+
+ write_sequnlock_irqrestore(&tk->lock, flags);
}
/* time in seconds when suspend began */
static struct timespec timekeeping_suspend_time;
-static void update_sleep_time(struct timespec t)
-{
- timekeeper.total_sleep_time = t;
- timekeeper.offs_boot = timespec_to_ktime(t);
-}
-
/**
* __timekeeping_inject_sleeptime - Internal function to add sleep interval
* @delta: pointer to a timespec delta value
@@ -672,18 +716,16 @@ static void update_sleep_time(struct timespec t)
static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
struct timespec *delta)
{
- if (!timespec_valid(delta)) {
+ if (!timespec_valid_strict(delta)) {
printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
"sleep delta value!\n");
return;
}
-
tk_xtime_add(tk, delta);
- tk->wall_to_monotonic = timespec_sub(tk->wall_to_monotonic, *delta);
- update_sleep_time(timespec_add(tk->total_sleep_time, *delta));
+ tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *delta));
+ tk_set_sleep_time(tk, timespec_add(tk->total_sleep_time, *delta));
}
-
/**
* timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
* @delta: pointer to a timespec delta value
@@ -696,6 +738,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
*/
void timekeeping_inject_sleeptime(struct timespec *delta)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
struct timespec ts;
@@ -704,21 +747,20 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
return;
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
- timekeeping_forward_now(&timekeeper);
+ timekeeping_forward_now(tk);
- __timekeeping_inject_sleeptime(&timekeeper, delta);
+ __timekeeping_inject_sleeptime(tk, delta);
- timekeeping_update(&timekeeper, true);
+ timekeeping_update(tk, true);
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ write_sequnlock_irqrestore(&tk->lock, flags);
/* signal hrtimers about time change */
clock_was_set();
}
-
/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
*
@@ -728,6 +770,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
*/
static void timekeeping_resume(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
struct timespec ts;
@@ -735,18 +778,18 @@ static void timekeeping_resume(void)
clocksource_resume();
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
ts = timespec_sub(ts, timekeeping_suspend_time);
- __timekeeping_inject_sleeptime(&timekeeper, &ts);
+ __timekeeping_inject_sleeptime(tk, &ts);
}
/* re-base the last cycle value */
- timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
- timekeeper.ntp_error = 0;
+ tk->clock->cycle_last = tk->clock->read(tk->clock);
+ tk->ntp_error = 0;
timekeeping_suspended = 0;
- timekeeping_update(&timekeeper, false);
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ timekeeping_update(tk, false);
+ write_sequnlock_irqrestore(&tk->lock, flags);
touch_softlockup_watchdog();
@@ -758,14 +801,15 @@ static void timekeeping_resume(void)
static int timekeeping_suspend(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long flags;
struct timespec delta, delta_delta;
static struct timespec old_delta;
read_persistent_clock(&timekeeping_suspend_time);
- write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeping_forward_now(&timekeeper);
+ write_seqlock_irqsave(&tk->lock, flags);
+ timekeeping_forward_now(tk);
timekeeping_suspended = 1;
/*
@@ -774,7 +818,7 @@ static int timekeeping_suspend(void)
* try to compensate so the difference in system time
* and persistent_clock time stays close to constant.
*/
- delta = timespec_sub(tk_xtime(&timekeeper), timekeeping_suspend_time);
+ delta = timespec_sub(tk_xtime(tk), timekeeping_suspend_time);
delta_delta = timespec_sub(delta, old_delta);
if (abs(delta_delta.tv_sec) >= 2) {
/*
@@ -787,7 +831,7 @@ static int timekeeping_suspend(void)
timekeeping_suspend_time =
timespec_add(timekeeping_suspend_time, delta_delta);
}
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ write_sequnlock_irqrestore(&tk->lock, flags);
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
clocksource_suspend();
@@ -898,27 +942,29 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
* the error. This causes the likely below to be unlikely.
*
* The proper fix is to avoid rounding up by using
- * the high precision timekeeper.xtime_nsec instead of
+ * the high precision tk->xtime_nsec instead of
* xtime.tv_nsec everywhere. Fixing this will take some
* time.
*/
if (likely(error <= interval))
adj = 1;
else
- adj = timekeeping_bigadjust(tk, error, &interval,
- &offset);
- } else if (error < -interval) {
- /* See comment above, this is just switched for the negative */
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else
- adj = timekeeping_bigadjust(tk, error, &interval,
- &offset);
- } else
- return;
+ adj = timekeeping_bigadjust(tk, error, &interval, &offset);
+ } else {
+ if (error < -interval) {
+ /* See comment above, this is just switched for the negative */
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else {
+ adj = timekeeping_bigadjust(tk, error, &interval, &offset);
+ }
+ } else {
+ goto out_adjust;
+ }
+ }
if (unlikely(tk->clock->maxadj &&
(tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
@@ -981,6 +1027,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
tk->xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+out_adjust:
/*
* It may be possible that when we entered this function, xtime_nsec
* was very small. Further, if we're slightly speeding the clocksource
@@ -1003,7 +1050,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
-
/**
* accumulate_nsecs_to_secs - Accumulates nsecs into secs
*
@@ -1024,15 +1070,21 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
/* Figure out if its a leap sec and apply if needed */
leap = second_overflow(tk->xtime_sec);
- tk->xtime_sec += leap;
- tk->wall_to_monotonic.tv_sec -= leap;
- if (leap)
- clock_was_set_delayed();
+ if (unlikely(leap)) {
+ struct timespec ts;
+
+ tk->xtime_sec += leap;
+
+ ts.tv_sec = leap;
+ ts.tv_nsec = 0;
+ tk_set_wall_to_mono(tk,
+ timespec_sub(tk->wall_to_monotonic, ts));
+ clock_was_set_delayed();
+ }
}
}
-
/**
* logarithmic_accumulation - shifted accumulation of cycles
*
@@ -1076,7 +1128,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
return offset;
}
-
/**
* update_wall_time - Uses the current clocksource to increment the wall time
*
@@ -1084,25 +1135,30 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
static void update_wall_time(void)
{
struct clocksource *clock;
+ struct timekeeper *tk = &timekeeper;
cycle_t offset;
int shift = 0, maxshift;
unsigned long flags;
s64 remainder;
- write_seqlock_irqsave(&timekeeper.lock, flags);
+ write_seqlock_irqsave(&tk->lock, flags);
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
goto out;
- clock = timekeeper.clock;
+ clock = tk->clock;
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
- offset = timekeeper.cycle_interval;
+ offset = tk->cycle_interval;
#else
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
+ /* Check if there's really nothing to do */
+ if (offset < tk->cycle_interval)
+ goto out;
+
/*
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
@@ -1111,19 +1167,19 @@ static void update_wall_time(void)
* chunk in one go, and then try to consume the next smaller
* doubled multiple.
*/
- shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
+ shift = ilog2(offset) - ilog2(tk->cycle_interval);
shift = max(0, shift);
/* Bound shift to one less than what overflows tick_length */
maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
- while (offset >= timekeeper.cycle_interval) {
- offset = logarithmic_accumulation(&timekeeper, offset, shift);
- if(offset < timekeeper.cycle_interval<<shift)
+ while (offset >= tk->cycle_interval) {
+ offset = logarithmic_accumulation(tk, offset, shift);
+ if (offset < tk->cycle_interval<<shift)
shift--;
}
/* correct the clock when NTP error is too big */
- timekeeping_adjust(&timekeeper, offset);
+ timekeeping_adjust(tk, offset);
/*
@@ -1135,21 +1191,21 @@ static void update_wall_time(void)
* the vsyscall implementations are converted to use xtime_nsec
* (shifted nanoseconds), this can be killed.
*/
- remainder = timekeeper.xtime_nsec & ((1 << timekeeper.shift) - 1);
- timekeeper.xtime_nsec -= remainder;
- timekeeper.xtime_nsec += 1 << timekeeper.shift;
- timekeeper.ntp_error += remainder << timekeeper.ntp_error_shift;
+ remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
+ tk->xtime_nsec -= remainder;
+ tk->xtime_nsec += 1ULL << tk->shift;
+ tk->ntp_error += remainder << tk->ntp_error_shift;
/*
* Finally, make sure that after the rounding
* xtime_nsec isn't larger than NSEC_PER_SEC
*/
- accumulate_nsecs_to_secs(&timekeeper);
+ accumulate_nsecs_to_secs(tk);
- timekeeping_update(&timekeeper, false);
+ timekeeping_update(tk, false);
out:
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
+ write_sequnlock_irqrestore(&tk->lock, flags);
}
@@ -1166,18 +1222,18 @@ out:
*/
void getboottime(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec boottime = {
- .tv_sec = timekeeper.wall_to_monotonic.tv_sec +
- timekeeper.total_sleep_time.tv_sec,
- .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec +
- timekeeper.total_sleep_time.tv_nsec
+ .tv_sec = tk->wall_to_monotonic.tv_sec +
+ tk->total_sleep_time.tv_sec,
+ .tv_nsec = tk->wall_to_monotonic.tv_nsec +
+ tk->total_sleep_time.tv_nsec
};
set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
}
EXPORT_SYMBOL_GPL(getboottime);
-
/**
* get_monotonic_boottime - Returns monotonic time since boot
* @ts: pointer to the timespec to be set
@@ -1189,22 +1245,25 @@ EXPORT_SYMBOL_GPL(getboottime);
*/
void get_monotonic_boottime(struct timespec *ts)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec tomono, sleep;
+ s64 nsec;
unsigned int seq;
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&timekeeper.lock);
- ts->tv_sec = timekeeper.xtime_sec;
- ts->tv_nsec = timekeeping_get_ns(&timekeeper);
- tomono = timekeeper.wall_to_monotonic;
- sleep = timekeeper.total_sleep_time;
+ seq = read_seqbegin(&tk->lock);
+ ts->tv_sec = tk->xtime_sec;
+ nsec = timekeeping_get_ns(tk);
+ tomono = tk->wall_to_monotonic;
+ sleep = tk->total_sleep_time;
- } while (read_seqretry(&timekeeper.lock, seq));
+ } while (read_seqretry(&tk->lock, seq));
- set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
- ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec);
+ ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsec + tomono.tv_nsec + sleep.tv_nsec);
}
EXPORT_SYMBOL_GPL(get_monotonic_boottime);
@@ -1231,31 +1290,38 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime);
*/
void monotonic_to_bootbased(struct timespec *ts)
{
- *ts = timespec_add(*ts, timekeeper.total_sleep_time);
+ struct timekeeper *tk = &timekeeper;
+
+ *ts = timespec_add(*ts, tk->total_sleep_time);
}
EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
unsigned long get_seconds(void)
{
- return timekeeper.xtime_sec;
+ struct timekeeper *tk = &timekeeper;
+
+ return tk->xtime_sec;
}
EXPORT_SYMBOL(get_seconds);
struct timespec __current_kernel_time(void)
{
- return tk_xtime(&timekeeper);
+ struct timekeeper *tk = &timekeeper;
+
+ return tk_xtime(tk);
}
struct timespec current_kernel_time(void)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec now;
unsigned long seq;
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- now = tk_xtime(&timekeeper);
- } while (read_seqretry(&timekeeper.lock, seq));
+ now = tk_xtime(tk);
+ } while (read_seqretry(&tk->lock, seq));
return now;
}
@@ -1263,15 +1329,16 @@ EXPORT_SYMBOL(current_kernel_time);
struct timespec get_monotonic_coarse(void)
{
+ struct timekeeper *tk = &timekeeper;
struct timespec now, mono;
unsigned long seq;
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- now = tk_xtime(&timekeeper);
- mono = timekeeper.wall_to_monotonic;
- } while (read_seqretry(&timekeeper.lock, seq));
+ now = tk_xtime(tk);
+ mono = tk->wall_to_monotonic;
+ } while (read_seqretry(&tk->lock, seq));
set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
@@ -1300,14 +1367,15 @@ void do_timer(unsigned long ticks)
void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
struct timespec *wtom, struct timespec *sleep)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
do {
- seq = read_seqbegin(&timekeeper.lock);
- *xtim = tk_xtime(&timekeeper);
- *wtom = timekeeper.wall_to_monotonic;
- *sleep = timekeeper.total_sleep_time;
- } while (read_seqretry(&timekeeper.lock, seq));
+ seq = read_seqbegin(&tk->lock);
+ *xtim = tk_xtime(tk);
+ *wtom = tk->wall_to_monotonic;
+ *sleep = tk->total_sleep_time;
+ } while (read_seqretry(&tk->lock, seq));
}
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1321,19 +1389,20 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
*/
ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
{
+ struct timekeeper *tk = &timekeeper;
ktime_t now;
unsigned int seq;
u64 secs, nsecs;
do {
- seq = read_seqbegin(&timekeeper.lock);
+ seq = read_seqbegin(&tk->lock);
- secs = timekeeper.xtime_sec;
- nsecs = timekeeping_get_ns(&timekeeper);
+ secs = tk->xtime_sec;
+ nsecs = timekeeping_get_ns(tk);
- *offs_real = timekeeper.offs_real;
- *offs_boot = timekeeper.offs_boot;
- } while (read_seqretry(&timekeeper.lock, seq));
+ *offs_real = tk->offs_real;
+ *offs_boot = tk->offs_boot;
+ } while (read_seqretry(&tk->lock, seq));
now = ktime_add_ns(ktime_set(secs, 0), nsecs);
now = ktime_sub(now, *offs_real);
@@ -1346,19 +1415,19 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
*/
ktime_t ktime_get_monotonic_offset(void)
{
+ struct timekeeper *tk = &timekeeper;
unsigned long seq;
struct timespec wtom;
do {
- seq = read_seqbegin(&timekeeper.lock);
- wtom = timekeeper.wall_to_monotonic;
- } while (read_seqretry(&timekeeper.lock, seq));
+ seq = read_seqbegin(&tk->lock);
+ wtom = tk->wall_to_monotonic;
+ } while (read_seqretry(&tk->lock, seq));
return timespec_to_ktime(wtom);
}
EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
-
/**
* xtime_update() - advances the timekeeping infrastructure
* @ticks: number of ticks, that have elapsed since the last call.
diff --git a/kernel/timer.c b/kernel/timer.c
index a61c09374eb..d5de1b2292a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -92,24 +92,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
/* Functions below help us manage 'deferrable' flag */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
- return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+ return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
}
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
+static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
{
- return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+ return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
}
-static inline void timer_set_deferrable(struct timer_list *timer)
+static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
{
- timer->base = TBASE_MAKE_DEFERRED(timer->base);
+ return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
}
static inline void
timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
{
- timer->base = (struct tvec_base *)((unsigned long)(new_base) |
- tbase_get_deferrable(timer->base));
+ unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
+
+ timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
}
static unsigned long round_jiffies_common(unsigned long j, int cpu,
@@ -563,16 +564,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
debug_object_assert_init(timer, &timer_debug_descr);
}
-static void __init_timer(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key);
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key);
-void init_timer_on_stack_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
+void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
- __init_timer(timer, name, key);
+ do_init_timer(timer, flags, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
@@ -613,12 +612,13 @@ static inline void debug_assert_init(struct timer_list *timer)
debug_timer_assert_init(timer);
}
-static void __init_timer(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
+static void do_init_timer(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
{
+ struct tvec_base *base = __raw_get_cpu_var(tvec_bases);
+
timer->entry.next = NULL;
- timer->base = __raw_get_cpu_var(tvec_bases);
+ timer->base = (void *)((unsigned long)base | flags);
timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
timer->start_site = NULL;
@@ -628,22 +628,10 @@ static void __init_timer(struct timer_list *timer,
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
-void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key,
- void (*function)(unsigned long),
- unsigned long data)
-{
- timer->function = function;
- timer->data = data;
- init_timer_on_stack_key(timer, name, key);
- timer_set_deferrable(timer);
-}
-EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
-
/**
* init_timer_key - initialize a timer
* @timer: the timer to be initialized
+ * @flags: timer flags
* @name: name of the timer
* @key: lockdep class key of the fake lock used for tracking timer
* sync lock dependencies
@@ -651,24 +639,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
* init_timer_key() must be done to a timer prior calling *any* of the
* other timer functions.
*/
-void init_timer_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
+void init_timer_key(struct timer_list *timer, unsigned int flags,
+ const char *name, struct lock_class_key *key)
{
debug_init(timer);
- __init_timer(timer, name, key);
+ do_init_timer(timer, flags, name, key);
}
EXPORT_SYMBOL(init_timer_key);
-void init_timer_deferrable_key(struct timer_list *timer,
- const char *name,
- struct lock_class_key *key)
-{
- init_timer_key(timer, name, key);
- timer_set_deferrable(timer);
-}
-EXPORT_SYMBOL(init_timer_deferrable_key);
-
static inline void detach_timer(struct timer_list *timer, bool clear_pending)
{
struct list_head *entry = &timer->entry;
@@ -686,7 +664,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
{
detach_timer(timer, true);
if (!tbase_get_deferrable(timer->base))
- timer->base->active_timers--;
+ base->active_timers--;
}
static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -697,7 +675,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
detach_timer(timer, clear_pending);
if (!tbase_get_deferrable(timer->base)) {
- timer->base->active_timers--;
+ base->active_timers--;
if (timer->expires == base->next_timer)
base->next_timer = base->timer_jiffies;
}
@@ -1029,14 +1007,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*
* Synchronization rules: Callers must prevent restarting of the timer,
* otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
- * completion of the timer's handler. The timer's handler must not call
- * add_timer_on(). Upon exit the timer is not queued and the handler is
- * not running on any CPU.
+ * interrupt contexts unless the timer is an irqsafe one. The caller must
+ * not hold locks which would prevent completion of the timer's
+ * handler. The timer's handler must not call add_timer_on(). Upon exit the
+ * timer is not queued and the handler is not running on any CPU.
*
- * Note: You must not hold locks that are held in interrupt context
- * while calling this function. Even if the lock has nothing to do
- * with the timer in question. Here's why:
+ * Note: For !irqsafe timers, you must not hold locks that are held in
+ * interrupt context while calling this function. Even if the lock has
+ * nothing to do with the timer in question. Here's why:
*
* CPU0 CPU1
* ---- ----
@@ -1073,7 +1051,7 @@ int del_timer_sync(struct timer_list *timer)
* don't use it in hardirq context, because it
* could lead to deadlock.
*/
- WARN_ON(in_irq());
+ WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@@ -1180,19 +1158,27 @@ static inline void __run_timers(struct tvec_base *base)
while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
+ bool irqsafe;
timer = list_first_entry(head, struct timer_list,entry);
fn = timer->function;
data = timer->data;
+ irqsafe = tbase_get_irqsafe(timer->base);
timer_stats_account_timer(timer);
base->running_timer = timer;
detach_expired_timer(timer, base);
- spin_unlock_irq(&base->lock);
- call_timer_fn(timer, fn, data);
- spin_lock_irq(&base->lock);
+ if (irqsafe) {
+ spin_unlock(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock(&base->lock);
+ } else {
+ spin_unlock_irq(&base->lock);
+ call_timer_fn(timer, fn, data);
+ spin_lock_irq(&base->lock);
+ }
}
}
base->running_timer = NULL;
@@ -1407,13 +1393,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
#endif
-#ifndef __alpha__
-
-/*
- * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this
- * should be moved into arch/i386 instead?
- */
-
/**
* sys_getpid - return the thread group id of the current process
*
@@ -1469,8 +1448,6 @@ SYSCALL_DEFINE0(getegid)
return from_kgid_munged(current_user_ns(), current_egid());
}
-#endif
-
static void process_timeout(unsigned long __data)
{
wake_up_process((struct task_struct *)__data);
@@ -1800,9 +1777,13 @@ static struct notifier_block __cpuinitdata timers_nb = {
void __init init_timers(void)
{
- int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
- (void *)(long)smp_processor_id());
+ int err;
+
+ /* ensure there are enough low bits for flags in timer->base pointer */
+ BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
+ err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
+ (void *)(long)smp_processor_id());
init_timer_stats();
BUG_ON(err != NOTIFY_OK);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8c4c07071cc..4cea4f41c1d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS
help
See Documentation/trace/ftrace-design.txt
+config HAVE_FENTRY
+ bool
+ help
+ Arch supports the gcc options -pg with -mfentry
+
config HAVE_C_RECORDMCOUNT
bool
help
@@ -57,8 +62,12 @@ config HAVE_C_RECORDMCOUNT
config TRACER_MAX_TRACE
bool
+config TRACE_CLOCK
+ bool
+
config RING_BUFFER
bool
+ select TRACE_CLOCK
config FTRACE_NMI_ENTER
bool
@@ -109,6 +118,7 @@ config TRACING
select NOP_TRACER
select BINARY_PRINTF
select EVENT_TRACING
+ select TRACE_CLOCK
config GENERIC_TRACER
bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index b831087c820..d7e2068e4b7 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER
ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+ifdef CONFIG_FTRACE_SELFTEST
# selftest needs instrumentation
CFLAGS_trace_selftest_dynamic.o = -pg
obj-y += trace_selftest_dynamic.o
endif
+endif
# If unlikely tracing is enabled, do not trace these files
ifdef CONFIG_TRACING_BRANCHES
@@ -17,11 +19,7 @@ endif
CFLAGS_trace_events_filter.o := -I$(src)
-#
-# Make the trace clocks available generally: it's infrastructure
-# relied on by ptrace for example:
-#
-obj-y += trace_clock.o
+obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b4f20fba09f..9dcf15d3838 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -64,12 +64,20 @@
#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL)
+static struct ftrace_ops ftrace_list_end __read_mostly = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
/* Quick disabling of function tracer. */
-int function_trace_stop;
+int function_trace_stop __read_mostly;
+
+/* Current function tracing op */
+struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* List for set_ftrace_pid's pids. */
LIST_HEAD(ftrace_pids);
@@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops ftrace_list_end __read_mostly = {
- .func = ftrace_stub,
-};
-
static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
-static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub;
-ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
static struct ftrace_ops control_ops;
-static void
-ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+#else
+/* See comment below, where ftrace_ops_list_func is defined */
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
+#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
+#endif
+
+/**
+ * ftrace_nr_registered_ops - return number of ops registered
+ *
+ * Returns the number of ftrace_ops registered and tracing functions
+ */
+int ftrace_nr_registered_ops(void)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ mutex_lock(&ftrace_lock);
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next)
+ cnt++;
+
+ mutex_unlock(&ftrace_lock);
+
+ return cnt;
+}
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
@@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
-static void ftrace_global_list_func(unsigned long ip,
- unsigned long parent_ip)
+static void
+ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
{
- struct ftrace_ops *op;
-
if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
return;
trace_recursion_set(TRACE_GLOBAL_BIT);
op = rcu_dereference_raw(ftrace_global_list); /*see above*/
while (op != &ftrace_list_end) {
- op->func(ip, parent_ip);
+ op->func(ip, parent_ip, op, regs);
op = rcu_dereference_raw(op->next); /*see above*/
};
trace_recursion_clear(TRACE_GLOBAL_BIT);
}
-static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
+static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
{
if (!test_tsk_trace_trace(current))
return;
- ftrace_pid_function(ip, parent_ip);
+ ftrace_pid_function(ip, parent_ip, op, regs);
}
static void set_ftrace_pid_function(ftrace_func_t func)
@@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func)
void clear_ftrace_function(void)
{
ftrace_trace_function = ftrace_stub;
- __ftrace_trace_function = ftrace_stub;
- __ftrace_trace_function_delay = ftrace_stub;
ftrace_pid_function = ftrace_stub;
}
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
-/*
- * For those archs that do not test ftrace_trace_stop in their
- * mcount call site, we need to do it from C.
- */
-static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
-{
- if (function_trace_stop)
- return;
-
- __ftrace_trace_function(ip, parent_ip);
-}
-#endif
-
static void control_ops_disable_all(struct ftrace_ops *ops)
{
int cpu;
@@ -230,28 +243,27 @@ static void update_ftrace_function(void)
/*
* If we are at the end of the list and this ops is
- * not dynamic, then have the mcount trampoline call
- * the function directly
+ * recursion safe and not dynamic and the arch supports passing ops,
+ * then have the mcount trampoline call the function directly.
*/
if (ftrace_ops_list == &ftrace_list_end ||
(ftrace_ops_list->next == &ftrace_list_end &&
- !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
+ !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
+ (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
+ !FTRACE_FORCE_LIST_FUNC)) {
+ /* Set the ftrace_ops that the arch callback uses */
+ if (ftrace_ops_list == &global_ops)
+ function_trace_op = ftrace_global_list;
+ else
+ function_trace_op = ftrace_ops_list;
func = ftrace_ops_list->func;
- else
+ } else {
+ /* Just use the default ftrace_ops */
+ function_trace_op = &ftrace_list_end;
func = ftrace_ops_list_func;
+ }
-#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
ftrace_trace_function = func;
-#else
-#ifdef CONFIG_DYNAMIC_FTRACE
- /* do not update till all functions have been modified */
- __ftrace_trace_function_delay = func;
-#else
- __ftrace_trace_function = func;
-#endif
- ftrace_trace_function =
- (func == ftrace_stub) ? func : ftrace_test_stop_func;
-#endif
}
static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
@@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
return -EINVAL;
+#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+ /*
+ * If the ftrace_ops specifies SAVE_REGS, then it only can be used
+ * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
+ * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant.
+ */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS &&
+ !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED))
+ return -EINVAL;
+
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)
+ ops->flags |= FTRACE_OPS_FL_SAVE_REGS;
+#endif
+
if (!core_kernel_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
@@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
}
static void
-function_profile_call(unsigned long ip, unsigned long parent_ip)
+function_profile_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
@@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static int profile_graph_entry(struct ftrace_graph_ent *trace)
{
- function_profile_call(trace->func, 0);
+ function_profile_call(trace->func, 0, NULL, NULL);
return 1;
}
@@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static int register_ftrace_profiler(void)
@@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = {
.func = ftrace_stub,
.notrace_hash = EMPTY_HASH,
.filter_hash = EMPTY_HASH,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static DEFINE_MUTEX(ftrace_regex_lock);
@@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
rec->flags++;
if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
return;
+ /*
+ * If any ops wants regs saved for this function
+ * then all ops will get saved regs.
+ */
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
} else {
if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
return;
@@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
if (enable && (rec->flags & ~FTRACE_FL_MASK))
flag = FTRACE_FL_ENABLED;
+ /*
+ * If enabling and the REGS flag does not match the REGS_EN, then
+ * do not ignore this record. Set flags to fail the compare against
+ * ENABLED.
+ */
+ if (flag &&
+ (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
+ flag |= FTRACE_FL_REGS;
+
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
return FTRACE_UPDATE_IGNORE;
if (flag) {
- if (update)
+ /* Save off if rec is being enabled (for return value) */
+ flag ^= rec->flags & FTRACE_FL_ENABLED;
+
+ if (update) {
rec->flags |= FTRACE_FL_ENABLED;
- return FTRACE_UPDATE_MAKE_CALL;
+ if (flag & FTRACE_FL_REGS) {
+ if (rec->flags & FTRACE_FL_REGS)
+ rec->flags |= FTRACE_FL_REGS_EN;
+ else
+ rec->flags &= ~FTRACE_FL_REGS_EN;
+ }
+ }
+
+ /*
+ * If this record is being updated from a nop, then
+ * return UPDATE_MAKE_CALL.
+ * Otherwise, if the EN flag is set, then return
+ * UPDATE_MODIFY_CALL_REGS to tell the caller to convert
+ * from the non-save regs, to a save regs function.
+ * Otherwise,
+ * return UPDATE_MODIFY_CALL to tell the caller to convert
+ * from the save regs, to a non-save regs function.
+ */
+ if (flag & FTRACE_FL_ENABLED)
+ return FTRACE_UPDATE_MAKE_CALL;
+ else if (rec->flags & FTRACE_FL_REGS_EN)
+ return FTRACE_UPDATE_MODIFY_CALL_REGS;
+ else
+ return FTRACE_UPDATE_MODIFY_CALL;
}
- if (update)
- rec->flags &= ~FTRACE_FL_ENABLED;
+ if (update) {
+ /* If there's no more users, clear all flags */
+ if (!(rec->flags & ~FTRACE_FL_MASK))
+ rec->flags = 0;
+ else
+ /* Just disable the record (keep REGS state) */
+ rec->flags &= ~FTRACE_FL_ENABLED;
+ }
return FTRACE_UPDATE_MAKE_NOP;
}
@@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
static int
__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
{
+ unsigned long ftrace_old_addr;
unsigned long ftrace_addr;
int ret;
- ftrace_addr = (unsigned long)FTRACE_ADDR;
-
ret = ftrace_update_record(rec, enable);
+ if (rec->flags & FTRACE_FL_REGS)
+ ftrace_addr = (unsigned long)FTRACE_REGS_ADDR;
+ else
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
switch (ret) {
case FTRACE_UPDATE_IGNORE:
return 0;
@@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
case FTRACE_UPDATE_MAKE_NOP:
return ftrace_make_nop(NULL, rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MODIFY_CALL_REGS:
+ case FTRACE_UPDATE_MODIFY_CALL:
+ if (rec->flags & FTRACE_FL_REGS)
+ ftrace_old_addr = (unsigned long)FTRACE_ADDR;
+ else
+ ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR;
+
+ return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
}
return -1; /* unknow ftrace bug */
@@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- /*
- * For archs that call ftrace_test_stop_func(), we must
- * wait till after we update all the function callers
- * before we update the callback. This keeps different
- * ops that record different functions from corrupting
- * each other.
- */
- __ftrace_trace_function = __ftrace_trace_function_delay;
-#endif
function_trace_stop--;
ret = ftrace_arch_code_modify_post_process();
@@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, "%ps", (void *)rec->ip);
if (iter->flags & FTRACE_ITER_ENABLED)
- seq_printf(m, " (%ld)",
- rec->flags & ~FTRACE_FL_MASK);
+ seq_printf(m, " (%ld)%s",
+ rec->flags & ~FTRACE_FL_MASK,
+ rec->flags & FTRACE_FL_REGS ? " R" : "");
seq_printf(m, "\n");
return 0;
@@ -2790,8 +2870,8 @@ static int __init ftrace_mod_cmd_init(void)
}
device_initcall(ftrace_mod_cmd_init);
-static void
-function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
+static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct ftrace_func_probe *entry;
struct hlist_head *hhd;
@@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
- int reset, int enable)
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+{
+ struct ftrace_func_entry *entry;
+
+ if (!ftrace_location(ip))
+ return -EINVAL;
+
+ if (remove) {
+ entry = ftrace_lookup_ip(hash, ip);
+ if (!entry)
+ return -ENOENT;
+ free_hash_entry(hash, entry);
+ return 0;
+ }
+
+ return add_hash_entry(hash, ip);
+}
+
+static int
+ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
+ unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
+ if (ip) {
+ ret = ftrace_match_addr(hash, ip, remove);
+ if (ret < 0)
+ goto out_regex_unlock;
+ }
mutex_lock(&ftrace_lock);
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
@@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
return ret;
}
+static int
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
+ int reset, int enable)
+{
+ return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable);
+}
+
+/**
+ * ftrace_set_filter_ip - set a function to filter on in ftrace by address
+ * @ops - the ops to set the filter with
+ * @ip - the address to add to or remove from the filter.
+ * @remove - non zero to remove the ip from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ip is NULL, it failes to update filter.
+ */
+int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
+ int remove, int reset)
+{
+ return ftrace_set_addr(ops, ip, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable)
+{
+ return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+}
+
/**
* ftrace_set_filter - set a function to filter on in ftrace
* @ops - the ops to set the filter with
@@ -3912,6 +4047,7 @@ void __init ftrace_init(void)
static struct ftrace_ops global_ops = {
.func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static int __init ftrace_nodyn_init(void)
@@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
#endif /* CONFIG_DYNAMIC_FTRACE */
static void
-ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
+ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
{
- struct ftrace_ops *op;
-
if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT)))
return;
@@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
while (op != &ftrace_list_end) {
if (!ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
- op->func(ip, parent_ip);
+ op->func(ip, parent_ip, op, regs);
op = rcu_dereference_raw(op->next);
};
@@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops control_ops = {
.func = ftrace_ops_control_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
-static void
-ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
+static inline void
+__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ignored, struct pt_regs *regs)
{
struct ftrace_ops *op;
+ if (function_trace_stop)
+ return;
+
if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
return;
@@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
op = rcu_dereference_raw(ftrace_ops_list);
while (op != &ftrace_list_end) {
if (ftrace_ops_test(op, ip))
- op->func(ip, parent_ip);
+ op->func(ip, parent_ip, op, regs);
op = rcu_dereference_raw(op->next);
};
preempt_enable_notrace();
trace_recursion_clear(TRACE_INTERNAL_BIT);
}
+/*
+ * Some archs only support passing ip and parent_ip. Even though
+ * the list function ignores the op parameter, we do not want any
+ * C side effects, where a function is called without the caller
+ * sending a third parameter.
+ * Archs are to support both the regs and ftrace_ops at the same time.
+ * If they support ftrace_ops, it is assumed they support regs.
+ * If call backs want to use regs, they must either check for regs
+ * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS.
+ * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved.
+ * An architecture can pass partial regs with ftrace_ops and still
+ * set the ARCH_SUPPORT_FTARCE_OPS.
+ */
+#if ARCH_SUPPORTS_FTRACE_OPS
+static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ __ftrace_ops_list_func(ip, parent_ip, NULL, regs);
+}
+#else
+static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
+{
+ __ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
+}
+#endif
+
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 49491fa7daa..b32ed0e385a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
* to the buffer after this will fail and return NULL.
*
* This is different than ring_buffer_record_disable() as
- * it works like an on/off switch, where as the disable() verison
+ * it works like an on/off switch, where as the disable() version
* must be paired with a enable().
*/
void ring_buffer_record_off(struct ring_buffer *buffer)
@@ -2839,7 +2839,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off);
* ring_buffer_record_off().
*
* This is different than ring_buffer_record_enable() as
- * it works like an on/off switch, where as the enable() verison
+ * it works like an on/off switch, where as the enable() version
* must be paired with a disable().
*/
void ring_buffer_record_on(struct ring_buffer *buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5c38c81496c..1ec5c1dab62 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -328,7 +328,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
- TRACE_ITER_IRQ_INFO;
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
static int trace_stop_count;
static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,15 +426,15 @@ __setup("trace_buf_size=", set_buf_size);
static int __init set_tracing_thresh(char *str)
{
- unsigned long threshhold;
+ unsigned long threshold;
int ret;
if (!str)
return 0;
- ret = strict_strtoul(str, 0, &threshhold);
+ ret = strict_strtoul(str, 0, &threshold);
if (ret < 0)
return 0;
- tracing_thresh = threshhold * 1000;
+ tracing_thresh = threshold * 1000;
return 1;
}
__setup("tracing_thresh=", set_tracing_thresh);
@@ -470,6 +470,7 @@ static const char *trace_options[] = {
"overwrite",
"disable_on_free",
"irq-info",
+ "markers",
NULL
};
@@ -3886,6 +3887,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (tracing_disabled)
return -EINVAL;
+ if (!(trace_flags & TRACE_ITER_MARKERS))
+ return -EINVAL;
+
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 55e1f7f0db1..63a2da0b9a6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -472,11 +472,11 @@ extern void trace_find_cmdline(int pid, char comm[]);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
+#endif
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
-#endif
extern int ring_buffer_expanded;
extern bool tracing_selftest_disabled;
@@ -680,6 +680,7 @@ enum trace_iterator_flags {
TRACE_ITER_OVERWRITE = 0x200000,
TRACE_ITER_STOP_ON_FREE = 0x400000,
TRACE_ITER_IRQ_INFO = 0x800000,
+ TRACE_ITER_MARKERS = 0x1000000,
};
/*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index fee3752ae8f..84b1e045fab 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
#ifdef CONFIG_FUNCTION_TRACER
static void
-perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
+perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
struct ftrace_entry *entry;
struct hlist_head *head;
@@ -281,7 +282,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
head = this_cpu_ptr(event_function.perf_events);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
- 1, &regs, head);
+ 1, &regs, head, NULL);
#undef ENTRY_SIZE
}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 29111da1d10..d608d09d08c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1199,6 +1199,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
return 0;
}
+static void event_remove(struct ftrace_event_call *call)
+{
+ ftrace_event_enable_disable(call, 0);
+ if (call->event.funcs)
+ __unregister_ftrace_event(&call->event);
+ list_del(&call->list);
+}
+
+static int event_init(struct ftrace_event_call *call)
+{
+ int ret = 0;
+
+ if (WARN_ON(!call->name))
+ return -EINVAL;
+
+ if (call->class->raw_init) {
+ ret = call->class->raw_init(call);
+ if (ret < 0 && ret != -ENOSYS)
+ pr_warn("Could not initialize trace events/%s\n",
+ call->name);
+ }
+
+ return ret;
+}
+
static int
__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
const struct file_operations *id,
@@ -1209,19 +1234,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
struct dentry *d_events;
int ret;
- /* The linker may leave blanks */
- if (!call->name)
- return -EINVAL;
-
- if (call->class->raw_init) {
- ret = call->class->raw_init(call);
- if (ret < 0) {
- if (ret != -ENOSYS)
- pr_warning("Could not initialize trace events/%s\n",
- call->name);
- return ret;
- }
- }
+ ret = event_init(call);
+ if (ret < 0)
+ return ret;
d_events = event_trace_events_dir();
if (!d_events)
@@ -1272,13 +1287,10 @@ static void remove_subsystem_dir(const char *name)
*/
static void __trace_remove_event_call(struct ftrace_event_call *call)
{
- ftrace_event_enable_disable(call, 0);
- if (call->event.funcs)
- __unregister_ftrace_event(&call->event);
- debugfs_remove_recursive(call->dir);
- list_del(&call->list);
+ event_remove(call);
trace_destroy_fields(call);
destroy_preds(call);
+ debugfs_remove_recursive(call->dir);
remove_subsystem_dir(call->class->system);
}
@@ -1450,15 +1462,43 @@ static __init int setup_trace_event(char *str)
}
__setup("trace_event=", setup_trace_event);
+static __init int event_trace_enable(void)
+{
+ struct ftrace_event_call **iter, *call;
+ char *buf = bootup_event_buf;
+ char *token;
+ int ret;
+
+ for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) {
+
+ call = *iter;
+ ret = event_init(call);
+ if (!ret)
+ list_add(&call->list, &ftrace_events);
+ }
+
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ ret = ftrace_set_clr_event(token, 1);
+ if (ret)
+ pr_warn("Failed to enable trace event: %s\n", token);
+ }
+ return 0;
+}
+
static __init int event_trace_init(void)
{
- struct ftrace_event_call **call;
+ struct ftrace_event_call *call;
struct dentry *d_tracer;
struct dentry *entry;
struct dentry *d_events;
int ret;
- char *buf = bootup_event_buf;
- char *token;
d_tracer = tracing_init_dentry();
if (!d_tracer)
@@ -1497,24 +1537,19 @@ static __init int event_trace_init(void)
if (trace_define_common_fields())
pr_warning("tracing: Failed to allocate common fields");
- for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
- __trace_add_event_call(*call, NULL, &ftrace_event_id_fops,
+ /*
+ * Early initialization already enabled ftrace event.
+ * Now it's only necessary to create the event directory.
+ */
+ list_for_each_entry(call, &ftrace_events, list) {
+
+ ret = event_create_dir(call, d_events,
+ &ftrace_event_id_fops,
&ftrace_enable_fops,
&ftrace_event_filter_fops,
&ftrace_event_format_fops);
- }
-
- while (true) {
- token = strsep(&buf, ",");
-
- if (!token)
- break;
- if (!*token)
- continue;
-
- ret = ftrace_set_clr_event(token, 1);
- if (ret)
- pr_warning("Failed to enable trace event: %s\n", token);
+ if (ret < 0)
+ event_remove(call);
}
ret = register_module_notifier(&trace_module_nb);
@@ -1523,6 +1558,7 @@ static __init int event_trace_init(void)
return 0;
}
+core_initcall(event_trace_enable);
fs_initcall(event_trace_init);
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1646,9 +1682,11 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
- if (WARN_ON_ONCE(ret))
+ if (WARN_ON_ONCE(ret)) {
pr_warning("error disabling system %s\n",
system->name);
+ continue;
+ }
pr_cont("OK\n");
}
@@ -1681,7 +1719,8 @@ static __init void event_trace_self_tests(void)
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
static void
-function_test_events_call(unsigned long ip, unsigned long parent_ip)
+function_test_events_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct ring_buffer_event *event;
struct ring_buffer *buffer;
@@ -1720,6 +1759,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __initdata =
{
.func = function_test_events_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static __init void event_trace_self_test_with_function(void)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 431dba8b754..c154797a7ff 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter,
static int __ftrace_function_set_filter(int filter, char *buf, int len,
struct function_filter_data *data)
{
- int i, re_cnt, ret;
+ int i, re_cnt, ret = -EINVAL;
int *reset;
char **re;
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index a426f410c06..483162a9f90 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -49,7 +49,8 @@ static void function_trace_start(struct trace_array *tr)
}
static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
@@ -84,7 +85,9 @@ enum {
static struct tracer_flags func_flags;
static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
+function_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
+
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
@@ -121,7 +124,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
}
static void
-function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
@@ -164,13 +168,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = function_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
+ .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops trace_stack_ops __read_mostly =
{
.func = function_stack_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
+ .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct tracer_opt func_opts[] = {
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ce27c8ba8d3..99b4378393d 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
return;
}
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST
+#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY)
/*
* The arch may choose to record the frame pointer used
* and check it here to make sure that it is what we expect it
@@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,
*
* Currently, x86_32 with optimize for size (-Os) makes the latest
* gcc do the above.
+ *
+ * Note, -mfentry does not use frame pointers, and this test
+ * is not needed if CC_USING_FENTRY is set.
*/
if (unlikely(current->ret_stack[index].fp != frame_pointer)) {
ftrace_graph_stop();
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 99d20e92036..d98ee8283b2 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr,
* irqsoff uses its own tracer function to keep the overhead down:
*/
static void
-irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
@@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = irqsoff_tracer_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
+ .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b31d3d5699f..1a2117043bb 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -1002,7 +1002,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+ perf_trace_buf_submit(entry, size, rctx,
+ entry->ip, 1, regs, head, NULL);
}
/* Kretprobe profile handler */
@@ -1033,7 +1034,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
+ perf_trace_buf_submit(entry, size, rctx,
+ entry->ret_ip, 1, regs, head, NULL);
}
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ff791ea48b5..02170c00c41 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -108,7 +108,8 @@ out_enable:
* wakeup uses its own tracer function to keep the overhead down:
*/
static void
-wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
+wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
@@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = wakeup_tracer_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
+ .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 288541f977f..2c00a691a54 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
static int trace_selftest_test_probe1_cnt;
static void trace_selftest_test_probe1_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_probe1_cnt++;
}
static int trace_selftest_test_probe2_cnt;
static void trace_selftest_test_probe2_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_probe2_cnt++;
}
static int trace_selftest_test_probe3_cnt;
static void trace_selftest_test_probe3_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_probe3_cnt++;
}
static int trace_selftest_test_global_cnt;
static void trace_selftest_test_global_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_global_cnt++;
}
static int trace_selftest_test_dyn_cnt;
static void trace_selftest_test_dyn_func(unsigned long ip,
- unsigned long pip)
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
{
trace_selftest_test_dyn_cnt++;
}
static struct ftrace_ops test_probe1 = {
.func = trace_selftest_test_probe1_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe2 = {
.func = trace_selftest_test_probe2_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_probe3 = {
.func = trace_selftest_test_probe3_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static struct ftrace_ops test_global = {
- .func = trace_selftest_test_global_func,
- .flags = FTRACE_OPS_FL_GLOBAL,
+ .func = trace_selftest_test_global_func,
+ .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE,
};
static void print_counts(void)
@@ -393,10 +406,253 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
return ret;
}
+
+static int trace_selftest_recursion_cnt;
+static void trace_selftest_test_recursion_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ /*
+ * This function is registered without the recursion safe flag.
+ * The ftrace infrastructure should provide the recursion
+ * protection. If not, this will crash the kernel!
+ */
+ trace_selftest_recursion_cnt++;
+ DYN_FTRACE_TEST_NAME();
+}
+
+static void trace_selftest_test_recursion_safe_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ /*
+ * We said we would provide our own recursion. By calling
+ * this function again, we should recurse back into this function
+ * and count again. But this only happens if the arch supports
+ * all of ftrace features and nothing else is using the function
+ * tracing utility.
+ */
+ if (trace_selftest_recursion_cnt++)
+ return;
+ DYN_FTRACE_TEST_NAME();
+}
+
+static struct ftrace_ops test_rec_probe = {
+ .func = trace_selftest_test_recursion_func,
+};
+
+static struct ftrace_ops test_recsafe_probe = {
+ .func = trace_selftest_test_recursion_safe_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static int
+trace_selftest_function_recursion(void)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ int save_tracer_enabled = tracer_enabled;
+ char *func_name;
+ int len;
+ int ret;
+ int cnt;
+
+ /* The previous test PASSED */
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace recursion: ");
+
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+ tracer_enabled = 1;
+
+ /* Handle PPC64 '.' name */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1);
+ if (ret) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_rec_probe);
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_rec_probe);
+
+ ret = -1;
+ if (trace_selftest_recursion_cnt != 1) {
+ pr_cont("*callback not called once (%d)* ",
+ trace_selftest_recursion_cnt);
+ goto out;
+ }
+
+ trace_selftest_recursion_cnt = 1;
+
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace recursion safe: ");
+
+ ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1);
+ if (ret) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_recsafe_probe);
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_recsafe_probe);
+
+ /*
+ * If arch supports all ftrace features, and no other task
+ * was on the list, we should be fine.
+ */
+ if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
+ cnt = 2; /* Should have recursed */
+ else
+ cnt = 1;
+
+ ret = -1;
+ if (trace_selftest_recursion_cnt != cnt) {
+ pr_cont("*callback not called expected %d times (%d)* ",
+ cnt, trace_selftest_recursion_cnt);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ftrace_enabled = save_ftrace_enabled;
+ tracer_enabled = save_tracer_enabled;
+
+ return ret;
+}
#else
# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
+# define trace_selftest_function_recursion() ({ 0; })
#endif /* CONFIG_DYNAMIC_FTRACE */
+static enum {
+ TRACE_SELFTEST_REGS_START,
+ TRACE_SELFTEST_REGS_FOUND,
+ TRACE_SELFTEST_REGS_NOT_FOUND,
+} trace_selftest_regs_stat;
+
+static void trace_selftest_test_regs_func(unsigned long ip,
+ unsigned long pip,
+ struct ftrace_ops *op,
+ struct pt_regs *pt_regs)
+{
+ if (pt_regs)
+ trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND;
+ else
+ trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND;
+}
+
+static struct ftrace_ops test_regs_probe = {
+ .func = trace_selftest_test_regs_func,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
+};
+
+static int
+trace_selftest_function_regs(void)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ int save_tracer_enabled = tracer_enabled;
+ char *func_name;
+ int len;
+ int ret;
+ int supported = 0;
+
+#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS
+ supported = 1;
+#endif
+
+ /* The previous test PASSED */
+ pr_cont("PASSED\n");
+ pr_info("Testing ftrace regs%s: ",
+ !supported ? "(no arch support)" : "");
+
+ /* enable tracing, and record the filter function */
+ ftrace_enabled = 1;
+ tracer_enabled = 1;
+
+ /* Handle PPC64 '.' name */
+ func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ len = strlen(func_name);
+
+ ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1);
+ /*
+ * If DYNAMIC_FTRACE is not set, then we just trace all functions.
+ * This test really doesn't care.
+ */
+ if (ret && ret != -ENODEV) {
+ pr_cont("*Could not set filter* ");
+ goto out;
+ }
+
+ ret = register_ftrace_function(&test_regs_probe);
+ /*
+ * Now if the arch does not support passing regs, then this should
+ * have failed.
+ */
+ if (!supported) {
+ if (!ret) {
+ pr_cont("*registered save-regs without arch support* ");
+ goto out;
+ }
+ test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
+ ret = register_ftrace_function(&test_regs_probe);
+ }
+ if (ret) {
+ pr_cont("*could not register callback* ");
+ goto out;
+ }
+
+
+ DYN_FTRACE_TEST_NAME();
+
+ unregister_ftrace_function(&test_regs_probe);
+
+ ret = -1;
+
+ switch (trace_selftest_regs_stat) {
+ case TRACE_SELFTEST_REGS_START:
+ pr_cont("*callback never called* ");
+ goto out;
+
+ case TRACE_SELFTEST_REGS_FOUND:
+ if (supported)
+ break;
+ pr_cont("*callback received regs without arch support* ");
+ goto out;
+
+ case TRACE_SELFTEST_REGS_NOT_FOUND:
+ if (!supported)
+ break;
+ pr_cont("*callback received NULL regs* ");
+ goto out;
+ }
+
+ ret = 0;
+out:
+ ftrace_enabled = save_ftrace_enabled;
+ tracer_enabled = save_tracer_enabled;
+
+ return ret;
+}
+
/*
* Simple verification test of ftrace function tracer.
* Enable ftrace, sleep 1/10 second, and then read the trace
@@ -442,7 +698,14 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ret = trace_selftest_startup_dynamic_tracing(trace, tr,
DYN_FTRACE_TEST_NAME);
+ if (ret)
+ goto out;
+ ret = trace_selftest_function_recursion();
+ if (ret)
+ goto out;
+
+ ret = trace_selftest_function_regs();
out:
ftrace_enabled = save_ftrace_enabled;
tracer_enabled = save_tracer_enabled;
@@ -778,6 +1041,8 @@ static int trace_wakeup_test_thread(void *data)
set_current_state(TASK_INTERRUPTIBLE);
schedule();
+ complete(x);
+
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
/*
@@ -821,24 +1086,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* reset the max latency */
tracing_max_latency = 0;
- /* sleep to let the RT thread sleep too */
- msleep(100);
+ while (p->on_rq) {
+ /*
+ * Sleep to make sure the RT thread is asleep too.
+ * On virtual machines we can't rely on timings,
+ * but we want to make sure this test still works.
+ */
+ msleep(100);
+ }
- /*
- * Yes this is slightly racy. It is possible that for some
- * strange reason that the RT thread we created, did not
- * call schedule for 100ms after doing the completion,
- * and we do a wakeup on a task that already is awake.
- * But that is extremely unlikely, and the worst thing that
- * happens in such a case, is that we disable tracing.
- * Honestly, if this race does happen something is horrible
- * wrong with the system.
- */
+ init_completion(&isrt);
wake_up_process(p);
- /* give a little time to let the thread wake up */
- msleep(100);
+ /* Wait for the task to wake up */
+ wait_for_completion(&isrt);
/* stop the tracing. */
tracing_stop();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d4545f49242..0c1b165778e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -111,7 +111,8 @@ static inline void check_stack(void)
}
static void
-stack_trace_call(unsigned long ip, unsigned long parent_ip)
+stack_trace_call(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *pt_regs)
{
int cpu;
@@ -136,6 +137,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE,
};
static ssize_t
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc7336909..2485a7d09b1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -487,7 +487,7 @@ int __init init_ftrace_syscalls(void)
return 0;
}
-core_initcall(init_ftrace_syscalls);
+early_initcall(init_ftrace_syscalls);
#ifdef CONFIG_PERF_EVENTS
@@ -506,6 +506,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size;
syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
@@ -532,7 +534,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
(unsigned long *)&rec->args);
head = this_cpu_ptr(sys_data->enter_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -580,6 +582,8 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int size;
syscall_nr = syscall_get_nr(current, regs);
+ if (syscall_nr < 0)
+ return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;
@@ -608,7 +612,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->ret = syscall_get_return_value(current, regs);
head = this_cpu_ptr(sys_data->exit_event->perf_events);
- perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+ perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL);
}
int perf_sysexit_enable(struct ftrace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2b36ac68549..03003cd7dd9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -670,7 +670,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
+ perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
out:
preempt_enable();
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 69add8a9da6..9d4c8d5a1f5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -22,6 +22,7 @@
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/sysctl.h>
+#include <linux/smpboot.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@@ -29,16 +30,18 @@
int watchdog_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+static int __read_mostly watchdog_disabled;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
+static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
+static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
-static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
@@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event,
__this_cpu_write(hard_watchdog_warn, false);
return;
}
+#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-#else
-static inline void watchdog_interrupt_count(void) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+
+static int watchdog_nmi_enable(unsigned int cpu);
+static void watchdog_nmi_disable(unsigned int cpu);
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
@@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
}
+static void watchdog_set_prio(unsigned int policy, unsigned int prio)
+{
+ struct sched_param param = { .sched_priority = prio };
-/*
- * The watchdog thread - touches the timestamp.
- */
-static int watchdog(void *unused)
+ sched_setscheduler(current, policy, &param);
+}
+
+static void watchdog_enable(unsigned int cpu)
{
- struct sched_param param = { .sched_priority = 0 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- /* initialize timestamp */
- __touch_watchdog();
+ if (!watchdog_enabled) {
+ kthread_park(current);
+ return;
+ }
+
+ /* Enable the perf event */
+ watchdog_nmi_enable(cpu);
/* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
/* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
HRTIMER_MODE_REL_PINNED);
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly (kicked by the hrtimer callback function) once every
- * get_sample_period() seconds (4 seconds by default) to reset the
- * softlockup timestamp. If this gets delayed for more than
- * 2*watchdog_thresh seconds then the debug-printout triggers in
- * watchdog_timer_fn().
- */
- while (!kthread_should_stop()) {
- __touch_watchdog();
- schedule();
+ /* initialize timestamp */
+ watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
+ __touch_watchdog();
+}
+
+static void watchdog_disable(unsigned int cpu)
+{
+ struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- if (kthread_should_stop())
- break;
+ watchdog_set_prio(SCHED_NORMAL, 0);
+ hrtimer_cancel(hrtimer);
+ /* disable the perf event */
+ watchdog_nmi_disable(cpu);
+}
- set_current_state(TASK_INTERRUPTIBLE);
- }
- /*
- * Drop the policy/priority elevation during thread exit to avoid a
- * scheduling latency spike.
- */
- __set_current_state(TASK_RUNNING);
- sched_setscheduler(current, SCHED_NORMAL, &param);
- return 0;
+static int watchdog_should_run(unsigned int cpu)
+{
+ return __this_cpu_read(hrtimer_interrupts) !=
+ __this_cpu_read(soft_lockup_hrtimer_cnt);
}
+/*
+ * The watchdog thread function - touches the timestamp.
+ *
+ * It only runs once every get_sample_period() seconds (4 seconds by
+ * default) to reset the softlockup timestamp. If this gets delayed
+ * for more than 2*watchdog_thresh seconds then the debug-printout
+ * triggers in watchdog_timer_fn().
+ */
+static void watchdog(unsigned int cpu)
+{
+ __this_cpu_write(soft_lockup_hrtimer_cnt,
+ __this_cpu_read(hrtimer_interrupts));
+ __touch_watchdog();
+}
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/*
@@ -379,7 +403,7 @@ static int watchdog(void *unused)
*/
static unsigned long cpu0_err;
-static int watchdog_nmi_enable(int cpu)
+static int watchdog_nmi_enable(unsigned int cpu)
{
struct perf_event_attr *wd_attr;
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -433,7 +457,7 @@ out:
return 0;
}
-static void watchdog_nmi_disable(int cpu)
+static void watchdog_nmi_disable(unsigned int cpu)
{
struct perf_event *event = per_cpu(watchdog_ev, cpu);
@@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu)
return;
}
#else
-static int watchdog_nmi_enable(int cpu) { return 0; }
-static void watchdog_nmi_disable(int cpu) { return; }
+static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
+static void watchdog_nmi_disable(unsigned int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/* prepare/enable/disable routines */
-static void watchdog_prepare_cpu(int cpu)
-{
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- WARN_ON(per_cpu(softlockup_watchdog, cpu));
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
-}
-
-static int watchdog_enable(int cpu)
-{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err = 0;
-
- /* enable the perf event */
- err = watchdog_nmi_enable(cpu);
-
- /* Regardless of err above, fall through and start softlockup */
-
- /* create the watchdog thread */
- if (!p) {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
- if (IS_ERR(p)) {
- pr_err("softlockup watchdog for %i failed\n", cpu);
- if (!err) {
- /* if hardlockup hasn't already set this */
- err = PTR_ERR(p);
- /* and disable the perf event */
- watchdog_nmi_disable(cpu);
- }
- goto out;
- }
- sched_setscheduler(p, SCHED_FIFO, &param);
- kthread_bind(p, cpu);
- per_cpu(watchdog_touch_ts, cpu) = 0;
- per_cpu(softlockup_watchdog, cpu) = p;
- wake_up_process(p);
- }
-
-out:
- return err;
-}
-
-static void watchdog_disable(int cpu)
-{
- struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
-
- /*
- * cancel the timer first to stop incrementing the stats
- * and waking up the kthread
- */
- hrtimer_cancel(hrtimer);
-
- /* disable the perf event */
- watchdog_nmi_disable(cpu);
-
- /* stop the watchdog thread */
- if (p) {
- per_cpu(softlockup_watchdog, cpu) = NULL;
- kthread_stop(p);
- }
-}
-
/* sysctl functions */
#ifdef CONFIG_SYSCTL
static void watchdog_enable_all_cpus(void)
{
- int cpu;
-
- watchdog_enabled = 0;
-
- for_each_online_cpu(cpu)
- if (!watchdog_enable(cpu))
- /* if any cpu succeeds, watchdog is considered
- enabled for the system */
- watchdog_enabled = 1;
-
- if (!watchdog_enabled)
- pr_err("failed to be enabled on some cpus\n");
+ unsigned int cpu;
+ if (watchdog_disabled) {
+ watchdog_disabled = 0;
+ for_each_online_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
}
static void watchdog_disable_all_cpus(void)
{
- int cpu;
+ unsigned int cpu;
- for_each_online_cpu(cpu)
- watchdog_disable(cpu);
-
- /* if all watchdogs are disabled, then they are disabled for the system */
- watchdog_enabled = 0;
+ if (!watchdog_disabled) {
+ watchdog_disabled = 1;
+ for_each_online_cpu(cpu)
+ kthread_park(per_cpu(softlockup_watchdog, cpu));
+ }
}
-
/*
* proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
*/
@@ -557,90 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write,
{
int ret;
+ if (watchdog_disabled < 0)
+ return -ENODEV;
+
ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (ret || !write)
- goto out;
+ return ret;
if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else
watchdog_disable_all_cpus();
-out:
return ret;
}
#endif /* CONFIG_SYSCTL */
-
-/*
- * Create/destroy watchdog threads as CPUs come and go:
- */
-static int
-cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
- int hotcpu = (unsigned long)hcpu;
-
- switch (action) {
- case CPU_UP_PREPARE:
- case CPU_UP_PREPARE_FROZEN:
- watchdog_prepare_cpu(hotcpu);
- break;
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- if (watchdog_enabled)
- watchdog_enable(hotcpu);
- break;
-#ifdef CONFIG_HOTPLUG_CPU
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- watchdog_disable(hotcpu);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- watchdog_disable(hotcpu);
- break;
-#endif /* CONFIG_HOTPLUG_CPU */
- }
-
- /*
- * hardlockup and softlockup are not important enough
- * to block cpu bring up. Just always succeed and
- * rely on printk output to flag problems.
- */
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_nfb = {
- .notifier_call = cpu_callback
+static struct smp_hotplug_thread watchdog_threads = {
+ .store = &softlockup_watchdog,
+ .thread_should_run = watchdog_should_run,
+ .thread_fn = watchdog,
+ .thread_comm = "watchdog/%u",
+ .setup = watchdog_enable,
+ .park = watchdog_disable,
+ .unpark = watchdog_enable,
};
-#ifdef CONFIG_SUSPEND
-/*
- * On exit from suspend we force an offline->online transition on the boot CPU
- * so that the PMU state that was lost while in suspended state gets set up
- * properly for the boot CPU. This information is required for restarting the
- * NMI watchdog.
- */
-void lockup_detector_bootcpu_resume(void)
-{
- void *cpu = (void *)(long)smp_processor_id();
-
- cpu_callback(&cpu_nfb, CPU_DEAD_FROZEN, cpu);
- cpu_callback(&cpu_nfb, CPU_UP_PREPARE_FROZEN, cpu);
- cpu_callback(&cpu_nfb, CPU_ONLINE_FROZEN, cpu);
-}
-#endif
-
void __init lockup_detector_init(void)
{
- void *cpu = (void *)(long)smp_processor_id();
- int err;
-
- err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
- WARN_ON(notifier_to_errno(err));
-
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
- register_cpu_notifier(&cpu_nfb);
-
- return;
+ if (smpboot_register_percpu_thread(&watchdog_threads)) {
+ pr_err("Failed to create watchdog threads, disabled\n");
+ watchdog_disabled = -ENODEV;
+ }
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 692d97628a1..3c5a79e2134 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -66,6 +66,7 @@ enum {
/* pool flags */
POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
+ POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
/* worker flags */
WORKER_STARTED = 1 << 0, /* started */
@@ -652,7 +653,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = mutex_is_locked(&pool->manager_mutex);
+ bool managing = pool->flags & POOL_MANAGING_WORKERS;
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
@@ -1326,6 +1327,15 @@ static void idle_worker_rebind(struct worker *worker)
/* we did our part, wait for rebind_workers() to finish up */
wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND));
+
+ /*
+ * rebind_workers() shouldn't finish until all workers passed the
+ * above WORKER_REBIND wait. Tell it when done.
+ */
+ spin_lock_irq(&worker->pool->gcwq->lock);
+ if (!--worker->idle_rebind->cnt)
+ complete(&worker->idle_rebind->done);
+ spin_unlock_irq(&worker->pool->gcwq->lock);
}
/*
@@ -1339,8 +1349,16 @@ static void busy_worker_rebind_fn(struct work_struct *work)
struct worker *worker = container_of(work, struct worker, rebind_work);
struct global_cwq *gcwq = worker->pool->gcwq;
- if (worker_maybe_bind_and_lock(worker))
- worker_clr_flags(worker, WORKER_REBIND);
+ worker_maybe_bind_and_lock(worker);
+
+ /*
+ * %WORKER_REBIND must be cleared even if the above binding failed;
+ * otherwise, we may confuse the next CPU_UP cycle or oops / get
+ * stuck by calling idle_worker_rebind() prematurely. If CPU went
+ * down again inbetween, %WORKER_UNBOUND would be set, so clearing
+ * %WORKER_REBIND is always safe.
+ */
+ worker_clr_flags(worker, WORKER_REBIND);
spin_unlock_irq(&gcwq->lock);
}
@@ -1396,12 +1414,15 @@ retry:
/* set REBIND and kick idle ones, we'll wait for these later */
for_each_worker_pool(pool, gcwq) {
list_for_each_entry(worker, &pool->idle_list, entry) {
+ unsigned long worker_flags = worker->flags;
+
if (worker->flags & WORKER_REBIND)
continue;
- /* morph UNBOUND to REBIND */
- worker->flags &= ~WORKER_UNBOUND;
- worker->flags |= WORKER_REBIND;
+ /* morph UNBOUND to REBIND atomically */
+ worker_flags &= ~WORKER_UNBOUND;
+ worker_flags |= WORKER_REBIND;
+ ACCESS_ONCE(worker->flags) = worker_flags;
idle_rebind.cnt++;
worker->idle_rebind = &idle_rebind;
@@ -1419,25 +1440,15 @@ retry:
goto retry;
}
- /*
- * All idle workers are rebound and waiting for %WORKER_REBIND to
- * be cleared inside idle_worker_rebind(). Clear and release.
- * Clearing %WORKER_REBIND from this foreign context is safe
- * because these workers are still guaranteed to be idle.
- */
- for_each_worker_pool(pool, gcwq)
- list_for_each_entry(worker, &pool->idle_list, entry)
- worker->flags &= ~WORKER_REBIND;
-
- wake_up_all(&gcwq->rebind_hold);
-
- /* rebind busy workers */
+ /* all idle workers are rebound, rebind busy workers */
for_each_busy_worker(worker, i, pos, gcwq) {
struct work_struct *rebind_work = &worker->rebind_work;
+ unsigned long worker_flags = worker->flags;
- /* morph UNBOUND to REBIND */
- worker->flags &= ~WORKER_UNBOUND;
- worker->flags |= WORKER_REBIND;
+ /* morph UNBOUND to REBIND atomically */
+ worker_flags &= ~WORKER_UNBOUND;
+ worker_flags |= WORKER_REBIND;
+ ACCESS_ONCE(worker->flags) = worker_flags;
if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(rebind_work)))
@@ -1449,6 +1460,34 @@ retry:
worker->scheduled.next,
work_color_to_flags(WORK_NO_COLOR));
}
+
+ /*
+ * All idle workers are rebound and waiting for %WORKER_REBIND to
+ * be cleared inside idle_worker_rebind(). Clear and release.
+ * Clearing %WORKER_REBIND from this foreign context is safe
+ * because these workers are still guaranteed to be idle.
+ *
+ * We need to make sure all idle workers passed WORKER_REBIND wait
+ * in idle_worker_rebind() before returning; otherwise, workers can
+ * get stuck at the wait if hotplug cycle repeats.
+ */
+ idle_rebind.cnt = 1;
+ INIT_COMPLETION(idle_rebind.done);
+
+ for_each_worker_pool(pool, gcwq) {
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ worker->flags &= ~WORKER_REBIND;
+ idle_rebind.cnt++;
+ }
+ }
+
+ wake_up_all(&gcwq->rebind_hold);
+
+ if (--idle_rebind.cnt) {
+ spin_unlock_irq(&gcwq->lock);
+ wait_for_completion(&idle_rebind.done);
+ spin_lock_irq(&gcwq->lock);
+ }
}
static struct worker *alloc_worker(void)
@@ -1794,9 +1833,45 @@ static bool manage_workers(struct worker *worker)
struct worker_pool *pool = worker->pool;
bool ret = false;
- if (!mutex_trylock(&pool->manager_mutex))
+ if (pool->flags & POOL_MANAGING_WORKERS)
return ret;
+ pool->flags |= POOL_MANAGING_WORKERS;
+
+ /*
+ * To simplify both worker management and CPU hotplug, hold off
+ * management while hotplug is in progress. CPU hotplug path can't
+ * grab %POOL_MANAGING_WORKERS to achieve this because that can
+ * lead to idle worker depletion (all become busy thinking someone
+ * else is managing) which in turn can result in deadlock under
+ * extreme circumstances. Use @pool->manager_mutex to synchronize
+ * manager against CPU hotplug.
+ *
+ * manager_mutex would always be free unless CPU hotplug is in
+ * progress. trylock first without dropping @gcwq->lock.
+ */
+ if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
+ spin_unlock_irq(&pool->gcwq->lock);
+ mutex_lock(&pool->manager_mutex);
+ /*
+ * CPU hotplug could have happened while we were waiting
+ * for manager_mutex. Hotplug itself can't handle us
+ * because manager isn't either on idle or busy list, and
+ * @gcwq's state and ours could have deviated.
+ *
+ * As hotplug is now excluded via manager_mutex, we can
+ * simply try to bind. It will succeed or fail depending
+ * on @gcwq's current state. Try it and adjust
+ * %WORKER_UNBOUND accordingly.
+ */
+ if (worker_maybe_bind_and_lock(worker))
+ worker->flags &= ~WORKER_UNBOUND;
+ else
+ worker->flags |= WORKER_UNBOUND;
+
+ ret = true;
+ }
+
pool->flags &= ~POOL_MANAGE_WORKERS;
/*
@@ -1806,6 +1881,7 @@ static bool manage_workers(struct worker *worker)
ret |= maybe_destroy_workers(pool);
ret |= maybe_create_worker(pool);
+ pool->flags &= ~POOL_MANAGING_WORKERS;
mutex_unlock(&pool->manager_mutex);
return ret;
}
@@ -3500,18 +3576,17 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb,
#ifdef CONFIG_SMP
struct work_for_cpu {
- struct completion completion;
+ struct work_struct work;
long (*fn)(void *);
void *arg;
long ret;
};
-static int do_work_for_cpu(void *_wfc)
+static void work_for_cpu_fn(struct work_struct *work)
{
- struct work_for_cpu *wfc = _wfc;
+ struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work);
+
wfc->ret = wfc->fn(wfc->arg);
- complete(&wfc->completion);
- return 0;
}
/**
@@ -3526,19 +3601,11 @@ static int do_work_for_cpu(void *_wfc)
*/
long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
- struct task_struct *sub_thread;
- struct work_for_cpu wfc = {
- .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
- .fn = fn,
- .arg = arg,
- };
+ struct work_for_cpu wfc = { .fn = fn, .arg = arg };
- sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
- if (IS_ERR(sub_thread))
- return PTR_ERR(sub_thread);
- kthread_bind(sub_thread, cpu);
- wake_up_process(sub_thread);
- wait_for_completion(&wfc.completion);
+ INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
+ schedule_work_on(cpu, &wfc.work);
+ flush_work(&wfc.work);
return wfc.ret;
}
EXPORT_SYMBOL_GPL(work_on_cpu);