summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/extable.c25
-rw-r--r--kernel/lockdep.c16
-rw-r--r--kernel/panic.c115
-rw-r--r--kernel/power/disk.c4
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/rcupdate.c44
-rw-r--r--kernel/sched.c2
-rw-r--r--kernel/slow-work.c640
-rw-r--r--kernel/smp.c432
-rw-r--r--kernel/softirq.c6
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/trace/Kconfig9
-rw-r--r--kernel/trace/ftrace.c2
14 files changed, 1028 insertions, 279 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3ba55..bab1dffe37e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82b..c46da6a4703 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
+static inline int init_kernel_text(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sinittext &&
+ addr <= (unsigned long)_einittext)
+ return 1;
+ return 0;
+}
+
__notrace_funcgraph int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
@@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
return 1;
if (system_state == SYSTEM_BOOTING &&
- addr >= (unsigned long)_sinittext &&
- addr <= (unsigned long)_einittext)
+ init_kernel_text(addr))
return 1;
return 0;
}
@@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
- return __module_text_address(addr) != NULL;
+ if (__module_text_address(addr))
+ return 1;
+ /*
+ * There might be init symbols in saved stacktraces.
+ * Give those symbols a chance to be printed in
+ * backtraces (such as lockdep traces).
+ *
+ * Since we are after the module-symbols check, there's
+ * no danger of address overlap:
+ */
+ if (init_kernel_text(addr))
+ return 1;
+ return 0;
}
int kernel_text_address(unsigned long addr)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3673a3f44d9..981cd485428 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks;
atomic_t nr_find_usage_forwards_recursions;
atomic_t nr_find_usage_backwards_checks;
atomic_t nr_find_usage_backwards_recursions;
-# define debug_atomic_inc(ptr) atomic_inc(ptr)
-# define debug_atomic_dec(ptr) atomic_dec(ptr)
-# define debug_atomic_read(ptr) atomic_read(ptr)
-#else
-# define debug_atomic_inc(ptr) do { } while (0)
-# define debug_atomic_dec(ptr) do { } while (0)
-# define debug_atomic_read(ptr) 0
#endif
/*
@@ -1900,9 +1893,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
- printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+ printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
- printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
+ printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
print_lock_name(other);
printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
@@ -2015,7 +2008,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
enum lock_usage_bit bit, const char *name);
static int
-mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+ enum lock_usage_bit new_bit)
{
int excl_bit = exclusive_bit(new_bit);
int read = new_bit & 1;
@@ -2043,7 +2037,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
* states.
*/
if ((!read || !dir || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit)))
+ !usage(curr, this, excl_bit, state_name(new_bit & ~1)))
return 0;
/*
diff --git a/kernel/panic.c b/kernel/panic.c
index 32fe4eff1b8..3fd8c5bf8b3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
* This function is used through-out the kernel (including mm and fs)
* to indicate a major problem.
*/
+#include <linux/debug_locks.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/random.h>
#include <linux/reboot.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/sysrq.h>
-#include <linux/interrupt.h>
+#include <linux/init.h>
#include <linux/nmi.h>
-#include <linux/kexec.h>
-#include <linux/debug_locks.h>
-#include <linux/random.h>
-#include <linux/kallsyms.h>
#include <linux/dmi.h>
int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
*
* This function never returns.
*/
-
NORET_TYPE void panic(const char * fmt, ...)
{
- long i;
static char buf[1024];
va_list args;
-#if defined(CONFIG_S390)
- unsigned long caller = (unsigned long) __builtin_return_address(0);
-#endif
+ long i;
/*
- * It's possible to come here directly from a panic-assertion and not
- * have preempt disabled. Some functions called from here want
+ * It's possible to come here directly from a panic-assertion and
+ * not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*/
preempt_disable();
@@ -77,7 +73,6 @@ NORET_TYPE void panic(const char * fmt, ...)
#ifdef CONFIG_DEBUG_BUGVERBOSE
dump_stack();
#endif
- bust_spinlocks(0);
/*
* If we have crashed and we have a crash kernel loaded let it handle
@@ -86,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
crash_kexec(NULL);
-#ifdef CONFIG_SMP
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a panic
* situation.
*/
smp_send_stop();
-#endif
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
@@ -102,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
if (panic_timeout > 0) {
/*
- * Delay timeout seconds before rebooting the machine.
- * We can't use the "normal" timers since we just panicked..
- */
- printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+ * Delay timeout seconds before rebooting the machine.
+ * We can't use the "normal" timers since we just panicked.
+ */
+ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+
for (i = 0; i < panic_timeout*1000; ) {
touch_nmi_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
- /* This will not be a clean reboot, with everything
- * shutting down. But if there is a chance of
- * rebooting the system it will be rebooted.
+ /*
+ * This will not be a clean reboot, with everything
+ * shutting down. But if there is a chance of
+ * rebooting the system it will be rebooted.
*/
emergency_restart();
}
@@ -127,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
}
#endif
#if defined(CONFIG_S390)
- disabled_wait(caller);
+ {
+ unsigned long caller;
+
+ caller = (unsigned long)__builtin_return_address(0);
+ disabled_wait(caller);
+ }
#endif
local_irq_enable();
- for (i = 0;;) {
+ for (i = 0; ; ) {
touch_softlockup_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
+ bust_spinlocks(0);
}
EXPORT_SYMBOL(panic);
struct tnt {
- u8 bit;
- char true;
- char false;
+ u8 bit;
+ char true;
+ char false;
};
static const struct tnt tnts[] = {
- { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
- { TAINT_FORCED_MODULE, 'F', ' ' },
- { TAINT_UNSAFE_SMP, 'S', ' ' },
- { TAINT_FORCED_RMMOD, 'R', ' ' },
- { TAINT_MACHINE_CHECK, 'M', ' ' },
- { TAINT_BAD_PAGE, 'B', ' ' },
- { TAINT_USER, 'U', ' ' },
- { TAINT_DIE, 'D', ' ' },
- { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
- { TAINT_WARN, 'W', ' ' },
- { TAINT_CRAP, 'C', ' ' },
+ { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
+ { TAINT_FORCED_MODULE, 'F', ' ' },
+ { TAINT_UNSAFE_SMP, 'S', ' ' },
+ { TAINT_FORCED_RMMOD, 'R', ' ' },
+ { TAINT_MACHINE_CHECK, 'M', ' ' },
+ { TAINT_BAD_PAGE, 'B', ' ' },
+ { TAINT_USER, 'U', ' ' },
+ { TAINT_DIE, 'D', ' ' },
+ { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
+ { TAINT_WARN, 'W', ' ' },
+ { TAINT_CRAP, 'C', ' ' },
};
/**
@@ -195,7 +196,8 @@ const char *print_tainted(void)
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");
- return(buf);
+
+ return buf;
}
int test_taint(unsigned flag)
@@ -211,7 +213,8 @@ unsigned long get_taint(void)
void add_taint(unsigned flag)
{
- debug_locks = 0; /* can't trust the integrity of the kernel anymore */
+ /* can't trust the integrity of the kernel anymore: */
+ debug_locks = 0;
set_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(add_taint);
@@ -266,8 +269,8 @@ static void do_oops_enter_exit(void)
}
/*
- * Return true if the calling CPU is allowed to print oops-related info. This
- * is a bit racy..
+ * Return true if the calling CPU is allowed to print oops-related info.
+ * This is a bit racy..
*/
int oops_may_print(void)
{
@@ -276,20 +279,22 @@ int oops_may_print(void)
/*
* Called when the architecture enters its oops handler, before it prints
- * anything. If this is the first CPU to oops, and it's oopsing the first time
- * then let it proceed.
+ * anything. If this is the first CPU to oops, and it's oopsing the first
+ * time then let it proceed.
*
- * This is all enabled by the pause_on_oops kernel boot option. We do all this
- * to ensure that oopses don't scroll off the screen. It has the side-effect
- * of preventing later-oopsing CPUs from mucking up the display, too.
+ * This is all enabled by the pause_on_oops kernel boot option. We do all
+ * this to ensure that oopses don't scroll off the screen. It has the
+ * side-effect of preventing later-oopsing CPUs from mucking up the display,
+ * too.
*
- * It turns out that the CPU which is allowed to print ends up pausing for the
- * right duration, whereas all the other CPUs pause for twice as long: once in
- * oops_enter(), once in oops_exit().
+ * It turns out that the CPU which is allowed to print ends up pausing for
+ * the right duration, whereas all the other CPUs pause for twice as long:
+ * once in oops_enter(), once in oops_exit().
*/
void oops_enter(void)
{
- debug_locks_off(); /* can't trust the integrity of the kernel anymore */
+ /* can't trust the integrity of the kernel anymore: */
+ debug_locks_off();
do_oops_enter_exit();
}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f3db382c2b2..5f21ab2bbcd 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -289,7 +289,7 @@ static int create_image(int platform_mode)
* hibernation_snapshot - quiesce devices and create the hibernation
* snapshot image.
* @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform frimware for the power transition.
+ * prepare the platform firmware for the power transition.
*
* Must be called with pm_mutex held
*/
@@ -412,7 +412,7 @@ static int resume_target_kernel(bool platform_mode)
* hibernation_restore - quiesce devices and restore the hibernation
* snapshot image. If successful, control returns in hibernation_snaphot()
* @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform frimware for the transition.
+ * prepare the platform firmware for the transition.
*
* Must be called with pm_mutex held
*/
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5105f5a6a2c..aaad0ec3419 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -687,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
- if (ret < 0)
- goto out_put_task_struct;
out_put_task_struct:
put_task_struct(child);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cae8a059cf4..2c7b8457d0d 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
}
}
+static inline void wait_migrated_callbacks(void);
+
/*
* Orchestrate the specified type of RCU barrier, waiting for all
* RCU callbacks of the specified type to complete.
@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
+ wait_migrated_callbacks();
}
/**
@@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
+
+static void rcu_migrate_callback(struct rcu_head *notused)
+{
+ if (atomic_dec_and_test(&rcu_migrate_type_count))
+ wake_up(&rcu_migrate_wq);
+}
+
+static inline void wait_migrated_callbacks(void)
+{
+ wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+}
+
+static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ if (action == CPU_DYING) {
+ /*
+ * preempt_disable() in on_each_cpu() prevents stop_machine(),
+ * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
+ * returns, all online cpus have queued rcu_barrier_func(),
+ * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
+ *
+ * These callbacks ensure _rcu_barrier() waits for all
+ * RCU callbacks of the specified type to complete.
+ */
+ atomic_set(&rcu_migrate_type_count, 3);
+ call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
+ call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
+ call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
+ } else if (action == CPU_POST_DEAD) {
+ /* rcu_migrate_head is protected by cpu_add_remove_lock */
+ wait_migrated_callbacks();
+ }
+
+ return NOTIFY_OK;
+}
+
void __init rcu_init(void)
{
__rcu_init();
+ hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
}
void rcu_scheduler_starting(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 73513f4e19d..2325db2be31 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1110,7 +1110,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
if (rq == this_rq()) {
hrtimer_restart(timer);
} else if (!rq->hrtick_csd_pending) {
- __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+ __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
rq->hrtick_csd_pending = 1;
}
}
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 00000000000..cf2bc01186e
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,640 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
+ * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
+ * OOM */
+
+static void slow_work_cull_timeout(unsigned long);
+static void slow_work_oom_timeout(unsigned long);
+
+#ifdef CONFIG_SYSCTL
+static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+ void __user *, size_t *, loff_t *);
+
+static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+ void __user *, size_t *, loff_t *);
+#endif
+
+/*
+ * The pool of threads has at least min threads in it as long as someone is
+ * using the facility, and may have as many as max.
+ *
+ * A portion of the pool may be processing very slow operations.
+ */
+static unsigned slow_work_min_threads = 2;
+static unsigned slow_work_max_threads = 4;
+static unsigned vslow_work_proportion = 50; /* % of threads that may process
+ * very slow work */
+
+#ifdef CONFIG_SYSCTL
+static const int slow_work_min_min_threads = 2;
+static int slow_work_max_max_threads = 255;
+static const int slow_work_min_vslow = 1;
+static const int slow_work_max_vslow = 99;
+
+ctl_table slow_work_sysctls[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "min-threads",
+ .data = &slow_work_min_threads,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = slow_work_min_threads_sysctl,
+ .extra1 = (void *) &slow_work_min_min_threads,
+ .extra2 = &slow_work_max_threads,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "max-threads",
+ .data = &slow_work_max_threads,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = slow_work_max_threads_sysctl,
+ .extra1 = &slow_work_min_threads,
+ .extra2 = (void *) &slow_work_max_max_threads,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "vslow-percentage",
+ .data = &vslow_work_proportion,
+ .maxlen = sizeof(unsigned),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = (void *) &slow_work_min_vslow,
+ .extra2 = (void *) &slow_work_max_vslow,
+ },
+ { .ctl_name = 0 }
+};
+#endif
+
+/*
+ * The active state of the thread pool
+ */
+static atomic_t slow_work_thread_count;
+static atomic_t vslow_work_executing_count;
+
+static bool slow_work_may_not_start_new_thread;
+static bool slow_work_cull; /* cull a thread due to lack of activity */
+static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
+static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
+static struct slow_work slow_work_new_thread; /* new thread starter */
+
+/*
+ * The queues of work items and the lock governing access to them. These are
+ * shared between all the CPUs. It doesn't make sense to have per-CPU queues
+ * as the number of threads bears no relation to the number of CPUs.
+ *
+ * There are two queues of work items: one for slow work items, and one for
+ * very slow work items.
+ */
+static LIST_HEAD(slow_work_queue);
+static LIST_HEAD(vslow_work_queue);
+static DEFINE_SPINLOCK(slow_work_queue_lock);
+
+/*
+ * The thread controls. A variable used to signal to the threads that they
+ * should exit when the queue is empty, a waitqueue used by the threads to wait
+ * for signals, and a completion set by the last thread to exit.
+ */
+static bool slow_work_threads_should_exit;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
+static DECLARE_COMPLETION(slow_work_last_thread_exited);
+
+/*
+ * The number of users of the thread pool and its lock. Whilst this is zero we
+ * have no threads hanging around, and when this reaches zero, we wait for all
+ * active or queued work items to complete and kill all the threads we do have.
+ */
+static int slow_work_user_count;
+static DEFINE_MUTEX(slow_work_user_lock);
+
+/*
+ * Calculate the maximum number of active threads in the pool that are
+ * permitted to process very slow work items.
+ *
+ * The answer is rounded up to at least 1, but may not equal or exceed the
+ * maximum number of the threads in the pool. This means we always have at
+ * least one thread that can process slow work items, and we always have at
+ * least one thread that won't get tied up doing so.
+ */
+static unsigned slow_work_calc_vsmax(void)
+{
+ unsigned vsmax;
+
+ vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
+ vsmax /= 100;
+ vsmax = max(vsmax, 1U);
+ return min(vsmax, slow_work_max_threads - 1);
+}
+
+/*
+ * Attempt to execute stuff queued on a slow thread. Return true if we managed
+ * it, false if there was nothing to do.
+ */
+static bool slow_work_execute(void)
+{
+ struct slow_work *work = NULL;
+ unsigned vsmax;
+ bool very_slow;
+
+ vsmax = slow_work_calc_vsmax();
+
+ /* see if we can schedule a new thread to be started if we're not
+ * keeping up with the work */
+ if (!waitqueue_active(&slow_work_thread_wq) &&
+ (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
+ atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
+ !slow_work_may_not_start_new_thread)
+ slow_work_enqueue(&slow_work_new_thread);
+
+ /* find something to execute */
+ spin_lock_irq(&slow_work_queue_lock);
+ if (!list_empty(&vslow_work_queue) &&
+ atomic_read(&vslow_work_executing_count) < vsmax) {
+ work = list_entry(vslow_work_queue.next,
+ struct slow_work, link);
+ if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+ BUG();
+ list_del_init(&work->link);
+ atomic_inc(&vslow_work_executing_count);
+ very_slow = true;
+ } else if (!list_empty(&slow_work_queue)) {
+ work = list_entry(slow_work_queue.next,
+ struct slow_work, link);
+ if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+ BUG();
+ list_del_init(&work->link);
+ very_slow = false;
+ } else {
+ very_slow = false; /* avoid the compiler warning */
+ }
+ spin_unlock_irq(&slow_work_queue_lock);
+
+ if (!work)
+ return false;
+
+ if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
+ BUG();
+
+ work->ops->execute(work);
+
+ if (very_slow)
+ atomic_dec(&vslow_work_executing_count);
+ clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
+
+ /* if someone tried to enqueue the item whilst we were executing it,
+ * then it'll be left unenqueued to avoid multiple threads trying to
+ * execute it simultaneously
+ *
+ * there is, however, a race between us testing the pending flag and
+ * getting the spinlock, and between the enqueuer setting the pending
+ * flag and getting the spinlock, so we use a deferral bit to tell us
+ * if the enqueuer got there first
+ */
+ if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
+ spin_lock_irq(&slow_work_queue_lock);
+
+ if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
+ test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
+ goto auto_requeue;
+
+ spin_unlock_irq(&slow_work_queue_lock);
+ }
+
+ work->ops->put_ref(work);
+ return true;
+
+auto_requeue:
+ /* we must complete the enqueue operation
+ * - we transfer our ref on the item back to the appropriate queue
+ * - don't wake another thread up as we're awake already
+ */
+ if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+ list_add_tail(&work->link, &vslow_work_queue);
+ else
+ list_add_tail(&work->link, &slow_work_queue);
+ spin_unlock_irq(&slow_work_queue_lock);
+ return true;
+}
+
+/**
+ * slow_work_enqueue - Schedule a slow work item for processing
+ * @work: The work item to queue
+ *
+ * Schedule a slow work item for processing. If the item is already undergoing
+ * execution, this guarantees not to re-enter the execution routine until the
+ * first execution finishes.
+ *
+ * The item is pinned by this function as it retains a reference to it, managed
+ * through the item operations. The item is unpinned once it has been
+ * executed.
+ *
+ * An item may hog the thread that is running it for a relatively large amount
+ * of time, sufficient, for example, to perform several lookup, mkdir, create
+ * and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
+ *
+ * Conversely, if a number of items are awaiting processing, it may take some
+ * time before any given item is given attention. The number of threads in the
+ * pool may be increased to deal with demand, but only up to a limit.
+ *
+ * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
+ * the very slow queue, from which only a portion of the threads will be
+ * allowed to pick items to execute. This ensures that very slow items won't
+ * overly block ones that are just ordinarily slow.
+ *
+ * Returns 0 if successful, -EAGAIN if not.
+ */
+int slow_work_enqueue(struct slow_work *work)
+{
+ unsigned long flags;
+
+ BUG_ON(slow_work_user_count <= 0);
+ BUG_ON(!work);
+ BUG_ON(!work->ops);
+ BUG_ON(!work->ops->get_ref);
+
+ /* when honouring an enqueue request, we only promise that we will run
+ * the work function in the future; we do not promise to run it once
+ * per enqueue request
+ *
+ * we use the PENDING bit to merge together repeat requests without
+ * having to disable IRQs and take the spinlock, whilst still
+ * maintaining our promise
+ */
+ if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+ spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+ /* we promise that we will not attempt to execute the work
+ * function in more than one thread simultaneously
+ *
+ * this, however, leaves us with a problem if we're asked to
+ * enqueue the work whilst someone is executing the work
+ * function as simply queueing the work immediately means that
+ * another thread may try executing it whilst it is already
+ * under execution
+ *
+ * to deal with this, we set the ENQ_DEFERRED bit instead of
+ * enqueueing, and the thread currently executing the work
+ * function will enqueue the work item when the work function
+ * returns and it has cleared the EXECUTING bit
+ */
+ if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+ set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+ } else {
+ if (work->ops->get_ref(work) < 0)
+ goto cant_get_ref;
+ if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+ list_add_tail(&work->link, &vslow_work_queue);
+ else
+ list_add_tail(&work->link, &slow_work_queue);
+ wake_up(&slow_work_thread_wq);
+ }
+
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ }
+ return 0;
+
+cant_get_ref:
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ return -EAGAIN;
+}
+EXPORT_SYMBOL(slow_work_enqueue);
+
+/*
+ * Worker thread culling algorithm
+ */
+static bool slow_work_cull_thread(void)
+{
+ unsigned long flags;
+ bool do_cull = false;
+
+ spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+ if (slow_work_cull) {
+ slow_work_cull = false;
+
+ if (list_empty(&slow_work_queue) &&
+ list_empty(&vslow_work_queue) &&
+ atomic_read(&slow_work_thread_count) >
+ slow_work_min_threads) {
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ do_cull = true;
+ }
+ }
+
+ spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+ return do_cull;
+}
+
+/*
+ * Determine if there is slow work available for dispatch
+ */
+static inline bool slow_work_available(int vsmax)
+{
+ return !list_empty(&slow_work_queue) ||
+ (!list_empty(&vslow_work_queue) &&
+ atomic_read(&vslow_work_executing_count) < vsmax);
+}
+
+/*
+ * Worker thread dispatcher
+ */
+static int slow_work_thread(void *_data)
+{
+ int vsmax;
+
+ DEFINE_WAIT(wait);
+
+ set_freezable();
+ set_user_nice(current, -5);
+
+ for (;;) {
+ vsmax = vslow_work_proportion;
+ vsmax *= atomic_read(&slow_work_thread_count);
+ vsmax /= 100;
+
+ prepare_to_wait(&slow_work_thread_wq, &wait,
+ TASK_INTERRUPTIBLE);
+ if (!freezing(current) &&
+ !slow_work_threads_should_exit &&
+ !slow_work_available(vsmax) &&
+ !slow_work_cull)
+ schedule();
+ finish_wait(&slow_work_thread_wq, &wait);
+
+ try_to_freeze();
+
+ vsmax = vslow_work_proportion;
+ vsmax *= atomic_read(&slow_work_thread_count);
+ vsmax /= 100;
+
+ if (slow_work_available(vsmax) && slow_work_execute()) {
+ cond_resched();
+ if (list_empty(&slow_work_queue) &&
+ list_empty(&vslow_work_queue) &&
+ atomic_read(&slow_work_thread_count) >
+ slow_work_min_threads)
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ continue;
+ }
+
+ if (slow_work_threads_should_exit)
+ break;
+
+ if (slow_work_cull && slow_work_cull_thread())
+ break;
+ }
+
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ complete_and_exit(&slow_work_last_thread_exited, 0);
+ return 0;
+}
+
+/*
+ * Handle thread cull timer expiration
+ */
+static void slow_work_cull_timeout(unsigned long data)
+{
+ slow_work_cull = true;
+ wake_up(&slow_work_thread_wq);
+}
+
+/*
+ * Get a reference on slow work thread starter
+ */
+static int slow_work_new_thread_get_ref(struct slow_work *work)
+{
+ return 0;
+}
+
+/*
+ * Drop a reference on slow work thread starter
+ */
+static void slow_work_new_thread_put_ref(struct slow_work *work)
+{
+}
+
+/*
+ * Start a new slow work thread
+ */
+static void slow_work_new_thread_execute(struct slow_work *work)
+{
+ struct task_struct *p;
+
+ if (slow_work_threads_should_exit)
+ return;
+
+ if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
+ return;
+
+ if (!mutex_trylock(&slow_work_user_lock))
+ return;
+
+ slow_work_may_not_start_new_thread = true;
+ atomic_inc(&slow_work_thread_count);
+ p = kthread_run(slow_work_thread, NULL, "kslowd");
+ if (IS_ERR(p)) {
+ printk(KERN_DEBUG "Slow work thread pool: OOM\n");
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ BUG(); /* we're running on a slow work thread... */
+ mod_timer(&slow_work_oom_timer,
+ jiffies + SLOW_WORK_OOM_TIMEOUT);
+ } else {
+ /* ratelimit the starting of new threads */
+ mod_timer(&slow_work_oom_timer, jiffies + 1);
+ }
+
+ mutex_unlock(&slow_work_user_lock);
+}
+
+static const struct slow_work_ops slow_work_new_thread_ops = {
+ .get_ref = slow_work_new_thread_get_ref,
+ .put_ref = slow_work_new_thread_put_ref,
+ .execute = slow_work_new_thread_execute,
+};
+
+/*
+ * post-OOM new thread start suppression expiration
+ */
+static void slow_work_oom_timeout(unsigned long data)
+{
+ slow_work_may_not_start_new_thread = false;
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Handle adjustment of the minimum number of threads
+ */
+static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int n;
+
+ if (ret == 0) {
+ mutex_lock(&slow_work_user_lock);
+ if (slow_work_user_count > 0) {
+ /* see if we need to start or stop threads */
+ n = atomic_read(&slow_work_thread_count) -
+ slow_work_min_threads;
+
+ if (n < 0 && !slow_work_may_not_start_new_thread)
+ slow_work_enqueue(&slow_work_new_thread);
+ else if (n > 0)
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ }
+ mutex_unlock(&slow_work_user_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Handle adjustment of the maximum number of threads
+ */
+static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+ int n;
+
+ if (ret == 0) {
+ mutex_lock(&slow_work_user_lock);
+ if (slow_work_user_count > 0) {
+ /* see if we need to stop threads */
+ n = slow_work_max_threads -
+ atomic_read(&slow_work_thread_count);
+
+ if (n < 0)
+ mod_timer(&slow_work_cull_timer,
+ jiffies + SLOW_WORK_CULL_TIMEOUT);
+ }
+ mutex_unlock(&slow_work_user_lock);
+ }
+
+ return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
+/**
+ * slow_work_register_user - Register a user of the facility
+ *
+ * Register a user of the facility, starting up the initial threads if there
+ * aren't any other users at this point. This will return 0 if successful, or
+ * an error if not.
+ */
+int slow_work_register_user(void)
+{
+ struct task_struct *p;
+ int loop;
+
+ mutex_lock(&slow_work_user_lock);
+
+ if (slow_work_user_count == 0) {
+ printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
+ init_completion(&slow_work_last_thread_exited);
+
+ slow_work_threads_should_exit = false;
+ slow_work_init(&slow_work_new_thread,
+ &slow_work_new_thread_ops);
+ slow_work_may_not_start_new_thread = false;
+ slow_work_cull = false;
+
+ /* start the minimum number of threads */
+ for (loop = 0; loop < slow_work_min_threads; loop++) {
+ atomic_inc(&slow_work_thread_count);
+ p = kthread_run(slow_work_thread, NULL, "kslowd");
+ if (IS_ERR(p))
+ goto error;
+ }
+ printk(KERN_NOTICE "Slow work thread pool: Ready\n");
+ }
+
+ slow_work_user_count++;
+ mutex_unlock(&slow_work_user_lock);
+ return 0;
+
+error:
+ if (atomic_dec_and_test(&slow_work_thread_count))
+ complete(&slow_work_last_thread_exited);
+ if (loop > 0) {
+ printk(KERN_ERR "Slow work thread pool:"
+ " Aborting startup on ENOMEM\n");
+ slow_work_threads_should_exit = true;
+ wake_up_all(&slow_work_thread_wq);
+ wait_for_completion(&slow_work_last_thread_exited);
+ printk(KERN_ERR "Slow work thread pool: Aborted\n");
+ }
+ mutex_unlock(&slow_work_user_lock);
+ return PTR_ERR(p);
+}
+EXPORT_SYMBOL(slow_work_register_user);
+
+/**
+ * slow_work_unregister_user - Unregister a user of the facility
+ *
+ * Unregister a user of the facility, killing all the threads if this was the
+ * last one.
+ */
+void slow_work_unregister_user(void)
+{
+ mutex_lock(&slow_work_user_lock);
+
+ BUG_ON(slow_work_user_count <= 0);
+
+ slow_work_user_count--;
+ if (slow_work_user_count == 0) {
+ printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
+ slow_work_threads_should_exit = true;
+ wake_up_all(&slow_work_thread_wq);
+ wait_for_completion(&slow_work_last_thread_exited);
+ printk(KERN_NOTICE "Slow work thread pool:"
+ " Shut down complete\n");
+ }
+
+ del_timer_sync(&slow_work_cull_timer);
+
+ mutex_unlock(&slow_work_user_lock);
+}
+EXPORT_SYMBOL(slow_work_unregister_user);
+
+/*
+ * Initialise the slow work facility
+ */
+static int __init init_slow_work(void)
+{
+ unsigned nr_cpus = num_possible_cpus();
+
+ if (slow_work_max_threads < nr_cpus)
+ slow_work_max_threads = nr_cpus;
+#ifdef CONFIG_SYSCTL
+ if (slow_work_max_max_threads < nr_cpus * 2)
+ slow_work_max_max_threads = nr_cpus * 2;
+#endif
+ return 0;
+}
+
+subsys_initcall(init_slow_work);
diff --git a/kernel/smp.c b/kernel/smp.c
index bbedbb7efe3..858baac568e 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,40 +2,82 @@
* Generic helpers for smp ipi calls
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
- *
*/
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
#include <linux/smp.h>
+#include <linux/cpu.h>
static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
-static LIST_HEAD(call_function_queue);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+
+static struct {
+ struct list_head queue;
+ spinlock_t lock;
+} call_function __cacheline_aligned_in_smp =
+ {
+ .queue = LIST_HEAD_INIT(call_function.queue),
+ .lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
+ };
enum {
- CSD_FLAG_WAIT = 0x01,
- CSD_FLAG_ALLOC = 0x02,
- CSD_FLAG_LOCK = 0x04,
+ CSD_FLAG_LOCK = 0x01,
};
struct call_function_data {
- struct call_single_data csd;
- spinlock_t lock;
- unsigned int refs;
- struct rcu_head rcu_head;
- unsigned long cpumask_bits[];
+ struct call_single_data csd;
+ spinlock_t lock;
+ unsigned int refs;
+ cpumask_var_t cpumask;
};
struct call_single_queue {
- struct list_head list;
- spinlock_t lock;
+ struct list_head list;
+ spinlock_t lock;
+};
+
+static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
+ .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
+};
+
+static int
+hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ long cpu = (long)hcpu;
+ struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+ cpu_to_node(cpu)))
+ return NOTIFY_BAD;
+ break;
+
+#ifdef CONFIG_CPU_HOTPLUG
+ case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
+
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ free_cpumask_var(cfd->cpumask);
+ break;
+#endif
+ };
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
+ .notifier_call = hotplug_cfd,
};
static int __cpuinit init_call_single_data(void)
{
+ void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i) {
@@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
spin_lock_init(&q->lock);
INIT_LIST_HEAD(&q->list);
}
+
+ hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
+ register_cpu_notifier(&hotplug_cfd_notifier);
+
return 0;
}
early_initcall(init_call_single_data);
-static void csd_flag_wait(struct call_single_data *data)
+/*
+ * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
+ *
+ * For non-synchronous ipi calls the csd can still be in use by the
+ * previous function call. For multi-cpu calls its even more interesting
+ * as we'll have to ensure no other cpu is observing our csd.
+ */
+static void csd_lock_wait(struct call_single_data *data)
{
- /* Wait for response */
- do {
- if (!(data->flags & CSD_FLAG_WAIT))
- break;
+ while (data->flags & CSD_FLAG_LOCK)
cpu_relax();
- } while (1);
+}
+
+static void csd_lock(struct call_single_data *data)
+{
+ csd_lock_wait(data);
+ data->flags = CSD_FLAG_LOCK;
+
+ /*
+ * prevent CPU from reordering the above assignment
+ * to ->flags with any subsequent assignments to other
+ * fields of the specified call_single_data structure:
+ */
+ smp_mb();
+}
+
+static void csd_unlock(struct call_single_data *data)
+{
+ WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+
+ /*
+ * ensure we're all done before releasing data:
+ */
+ smp_mb();
+
+ data->flags &= ~CSD_FLAG_LOCK;
}
/*
- * Insert a previously allocated call_single_data element for execution
- * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ * Insert a previously allocated call_single_data element
+ * for execution on the given CPU. data must already have
+ * ->func, ->info, and ->flags set.
*/
-static void generic_exec_single(int cpu, struct call_single_data *data)
+static
+void generic_exec_single(int cpu, struct call_single_data *data, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
- int wait = data->flags & CSD_FLAG_WAIT, ipi;
unsigned long flags;
+ int ipi;
spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
@@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
spin_unlock_irqrestore(&dst->lock, flags);
/*
- * Make the list addition visible before sending the ipi.
+ * The list addition should be visible before sending the IPI
+ * handler locks the list to pull the entry off it because of
+ * normal cache coherency rules implied by spinlocks.
+ *
+ * If IPIs can go out of order to the cache coherency protocol
+ * in an architecture, sufficient synchronisation should be added
+ * to arch code to make it appear to obey cache coherency WRT
+ * locking and barrier primitives. Generic code isn't really
+ * equipped to do the right thing...
*/
- smp_mb();
-
if (ipi)
arch_send_call_function_single_ipi(cpu);
if (wait)
- csd_flag_wait(data);
-}
-
-static void rcu_free_call_data(struct rcu_head *head)
-{
- struct call_function_data *data;
-
- data = container_of(head, struct call_function_data, rcu_head);
-
- kfree(data);
+ csd_lock_wait(data);
}
/*
@@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void)
int cpu = get_cpu();
/*
- * It's ok to use list_for_each_rcu() here even though we may delete
- * 'pos', since list_del_rcu() doesn't clear ->next
+ * Ensure entry is visible on call_function_queue after we have
+ * entered the IPI. See comment in smp_call_function_many.
+ * If we don't have this, then we may miss an entry on the list
+ * and never get another IPI to process it.
+ */
+ smp_mb();
+
+ /*
+ * It's ok to use list_for_each_rcu() here even though we may
+ * delete 'pos', since list_del_rcu() doesn't clear ->next
*/
- rcu_read_lock();
- list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+ list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
- if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
+ spin_lock(&data->lock);
+ if (!cpumask_test_cpu(cpu, data->cpumask)) {
+ spin_unlock(&data->lock);
continue;
+ }
+ cpumask_clear_cpu(cpu, data->cpumask);
+ spin_unlock(&data->lock);
data->csd.func(data->csd.info);
spin_lock(&data->lock);
- cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
WARN_ON(data->refs == 0);
- data->refs--;
- refs = data->refs;
+ refs = --data->refs;
+ if (!refs) {
+ spin_lock(&call_function.lock);
+ list_del_rcu(&data->csd.list);
+ spin_unlock(&call_function.lock);
+ }
spin_unlock(&data->lock);
if (refs)
continue;
- spin_lock(&call_function_lock);
- list_del_rcu(&data->csd.list);
- spin_unlock(&call_function_lock);
-
- if (data->csd.flags & CSD_FLAG_WAIT) {
- /*
- * serialize stores to data with the flag clear
- * and wakeup
- */
- smp_wmb();
- data->csd.flags &= ~CSD_FLAG_WAIT;
- }
- if (data->csd.flags & CSD_FLAG_ALLOC)
- call_rcu(&data->rcu_head, rcu_free_call_data);
+ csd_unlock(&data->csd);
}
- rcu_read_unlock();
put_cpu();
}
/*
- * Invoked by arch to handle an IPI for call function single. Must be called
- * from the arch with interrupts disabled.
+ * Invoked by arch to handle an IPI for call function single. Must be
+ * called from the arch with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
+ unsigned int data_flags;
LIST_HEAD(list);
- /*
- * Need to see other stores to list head for checking whether
- * list is empty without holding q->lock
- */
- smp_read_barrier_depends();
- while (!list_empty(&q->list)) {
- unsigned int data_flags;
-
- spin_lock(&q->lock);
- list_replace_init(&q->list, &list);
- spin_unlock(&q->lock);
-
- while (!list_empty(&list)) {
- struct call_single_data *data;
-
- data = list_entry(list.next, struct call_single_data,
- list);
- list_del(&data->list);
-
- /*
- * 'data' can be invalid after this call if
- * flags == 0 (when called through
- * generic_exec_single(), so save them away before
- * making the call.
- */
- data_flags = data->flags;
-
- data->func(data->info);
-
- if (data_flags & CSD_FLAG_WAIT) {
- smp_wmb();
- data->flags &= ~CSD_FLAG_WAIT;
- } else if (data_flags & CSD_FLAG_LOCK) {
- smp_wmb();
- data->flags &= ~CSD_FLAG_LOCK;
- } else if (data_flags & CSD_FLAG_ALLOC)
- kfree(data);
- }
+ spin_lock(&q->lock);
+ list_replace_init(&q->list, &list);
+ spin_unlock(&q->lock);
+
+ while (!list_empty(&list)) {
+ struct call_single_data *data;
+
+ data = list_entry(list.next, struct call_single_data, list);
+ list_del(&data->list);
+
+ /*
+ * 'data' can be invalid after this call if flags == 0
+ * (when called through generic_exec_single()),
+ * so save them away before making the call:
+ */
+ data_flags = data->flags;
+
+ data->func(data->info);
+
/*
- * See comment on outer loop
+ * Unlocked CSDs are valid through generic_exec_single():
*/
- smp_read_barrier_depends();
+ if (data_flags & CSD_FLAG_LOCK)
+ csd_unlock(data);
}
}
@@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
{
- struct call_single_data d;
+ struct call_single_data d = {
+ .flags = 0,
+ };
unsigned long flags;
- /* prevent preemption and reschedule on another processor,
- as well as CPU removal */
- int me = get_cpu();
+ int this_cpu;
int err = 0;
+ /*
+ * prevent preemption and reschedule on another processor,
+ * as well as CPU removal
+ */
+ this_cpu = get_cpu();
+
/* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
- if (cpu == me) {
+ if (cpu == this_cpu) {
local_irq_save(flags);
func(info);
local_irq_restore(flags);
- } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data;
+ } else {
+ if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
+ struct call_single_data *data = &d;
+
+ if (!wait)
+ data = &__get_cpu_var(csd_data);
- if (!wait) {
- /*
- * We are calling a function on a single CPU
- * and we are not going to wait for it to finish.
- * We first try to allocate the data, but if we
- * fail, we fall back to use a per cpu data to pass
- * the information to that CPU. Since all callers
- * of this code will use the same data, we must
- * synchronize the callers to prevent a new caller
- * from corrupting the data before the callee
- * can access it.
- *
- * The CSD_FLAG_LOCK is used to let us know when
- * the IPI handler is done with the data.
- * The first caller will set it, and the callee
- * will clear it. The next caller must wait for
- * it to clear before we set it again. This
- * will make sure the callee is done with the
- * data before a new caller will use it.
- */
- data = kmalloc(sizeof(*data), GFP_ATOMIC);
- if (data)
- data->flags = CSD_FLAG_ALLOC;
- else {
- data = &per_cpu(csd_data, me);
- while (data->flags & CSD_FLAG_LOCK)
- cpu_relax();
- data->flags = CSD_FLAG_LOCK;
- }
+ csd_lock(data);
+
+ data->func = func;
+ data->info = info;
+ generic_exec_single(cpu, data, wait);
} else {
- data = &d;
- data->flags = CSD_FLAG_WAIT;
+ err = -ENXIO; /* CPU not online */
}
-
- data->func = func;
- data->info = info;
- generic_exec_single(cpu, data);
- } else {
- err = -ENXIO; /* CPU not online */
}
put_cpu();
+
return err;
}
EXPORT_SYMBOL(smp_call_function_single);
@@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
* @cpu: The CPU to run on.
* @data: Pre-allocated and setup data structure
*
- * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
- * data structure. Useful for embedding @data inside other structures, for
- * instance.
- *
+ * Like smp_call_function_single(), but allow caller to pass in a
+ * pre-allocated data structure. Useful for embedding @data inside
+ * other structures, for instance.
*/
-void __smp_call_function_single(int cpu, struct call_single_data *data)
+void __smp_call_function_single(int cpu, struct call_single_data *data,
+ int wait)
{
+ csd_lock(data);
+
/* Can deadlock when called with interrupts disabled */
- WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+ WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
- generic_exec_single(cpu, data);
+ generic_exec_single(cpu, data, wait);
}
-/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
+/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
+
#ifndef arch_send_call_function_ipi_mask
-#define arch_send_call_function_ipi_mask(maskp) \
- arch_send_call_function_ipi(*(maskp))
+# define arch_send_call_function_ipi_mask(maskp) \
+ arch_send_call_function_ipi(*(maskp))
#endif
/**
@@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
*
* If @wait is true, then returns once @func has returned. Note that @wait
* will be implicitly turned on in case of allocation failures, since
@@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* must be disabled when calling this function.
*/
void smp_call_function_many(const struct cpumask *mask,
- void (*func)(void *), void *info,
- bool wait)
+ void (*func)(void *), void *info, bool wait)
{
struct call_function_data *data;
unsigned long flags;
- int cpu, next_cpu;
+ int cpu, next_cpu, this_cpu = smp_processor_id();
/* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
+ WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
- /* So, what's a CPU they want? Ignoring this one. */
+ /* So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
- if (cpu == smp_processor_id())
+ if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
/* No online cpus? We're done. */
if (cpu >= nr_cpu_ids)
return;
/* Do we have another CPU which isn't us? */
next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
- if (next_cpu == smp_processor_id())
+ if (next_cpu == this_cpu)
next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
/* Fastpath: do that cpu by itself. */
@@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
- if (unlikely(!data)) {
- /* Slow path. */
- for_each_online_cpu(cpu) {
- if (cpu == smp_processor_id())
- continue;
- if (cpumask_test_cpu(cpu, mask))
- smp_call_function_single(cpu, func, info, wait);
- }
- return;
- }
+ data = &__get_cpu_var(cfd_data);
+ csd_lock(&data->csd);
- spin_lock_init(&data->lock);
- data->csd.flags = CSD_FLAG_ALLOC;
- if (wait)
- data->csd.flags |= CSD_FLAG_WAIT;
+ spin_lock_irqsave(&data->lock, flags);
data->csd.func = func;
data->csd.info = info;
- cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
- cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
- data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
+ cpumask_and(data->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, data->cpumask);
+ data->refs = cpumask_weight(data->cpumask);
- spin_lock_irqsave(&call_function_lock, flags);
- list_add_tail_rcu(&data->csd.list, &call_function_queue);
- spin_unlock_irqrestore(&call_function_lock, flags);
+ spin_lock(&call_function.lock);
+ /*
+ * Place entry at the _HEAD_ of the list, so that any cpu still
+ * observing the entry in generic_smp_call_function_interrupt()
+ * will not miss any other list entries:
+ */
+ list_add_rcu(&data->csd.list, &call_function.queue);
+ spin_unlock(&call_function.lock);
+
+ spin_unlock_irqrestore(&data->lock, flags);
/*
* Make the list addition visible before sending the ipi.
+ * (IPIs must obey or appear to obey normal Linux cache
+ * coherency rules -- see comment in generic_exec_single).
*/
smp_mb();
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
+ arch_send_call_function_ipi_mask(data->cpumask);
- /* optionally wait for the CPUs to complete */
+ /* Optionally wait for the CPUs to complete */
if (wait)
- csd_flag_wait(&data->csd);
+ csd_lock_wait(&data->csd);
}
EXPORT_SYMBOL(smp_call_function_many);
@@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
*
* Returns 0.
*
@@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
+
return 0;
}
EXPORT_SYMBOL(smp_call_function);
void ipi_call_lock(void)
{
- spin_lock(&call_function_lock);
+ spin_lock(&call_function.lock);
}
void ipi_call_unlock(void)
{
- spin_unlock(&call_function_lock);
+ spin_unlock(&call_function.lock);
}
void ipi_call_lock_irq(void)
{
- spin_lock_irq(&call_function_lock);
+ spin_lock_irq(&call_function.lock);
}
void ipi_call_unlock_irq(void)
{
- spin_unlock_irq(&call_function_lock);
+ spin_unlock_irq(&call_function.lock);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 57d3f67f6f3..ea23ec087ee 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void)
account_system_vtime(current);
__local_bh_disable((unsigned long)__builtin_return_address(0));
- trace_softirq_enter();
+ lockdep_softirq_enter();
cpu = smp_processor_id();
restart:
@@ -220,7 +220,7 @@ restart:
if (pending)
wakeup_softirqd();
- trace_softirq_exit();
+ lockdep_softirq_exit();
account_system_vtime(current);
_local_bh_enable();
@@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
cp->flags = 0;
cp->priv = softirq;
- __smp_call_function_single(cpu, cp);
+ __smp_call_function_single(cpu, cp, 0);
return 0;
}
return 1;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ec4543dfc0..82350f8f04f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
+#include <linux/slow-work.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &scan_unevictable_handler,
},
#endif
+#ifdef CONFIG_SLOW_WORK
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "slow-work",
+ .mode = 0555,
+ .child = slow_work_sysctls,
+ },
+#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 34e707e5ab8..504086ab444 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER
help
Enable the kernel to trace a function at both its return
and its entry.
- It's first purpose is to trace the duration of functions and
- draw a call graph for each thread with some informations like
- the return value.
- This is done by setting the current return address on the current
- task structure into a stack of calls.
+ Its first purpose is to trace the duration of functions and
+ draw a call graph for each thread with some information like
+ the return value. This is done by setting the current return
+ address on the current task structure into a stack of calls.
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913dfc7e..53e8c8bc0c9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
}
/**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
* @ops - ops structure that holds the function to unregister
*
* Unregister a function that was added to be called by ftrace profiling.