diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/extable.c | 25 | ||||
-rw-r--r-- | kernel/lockdep.c | 16 | ||||
-rw-r--r-- | kernel/panic.c | 115 | ||||
-rw-r--r-- | kernel/power/disk.c | 4 | ||||
-rw-r--r-- | kernel/ptrace.c | 2 | ||||
-rw-r--r-- | kernel/rcupdate.c | 44 | ||||
-rw-r--r-- | kernel/sched.c | 2 | ||||
-rw-r--r-- | kernel/slow-work.c | 640 | ||||
-rw-r--r-- | kernel/smp.c | 432 | ||||
-rw-r--r-- | kernel/softirq.c | 6 | ||||
-rw-r--r-- | kernel/sysctl.c | 9 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 9 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 2 |
14 files changed, 1028 insertions, 279 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e4791b3ba55..bab1dffe37e 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_SLOW_WORK) += slow-work.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/extable.c b/kernel/extable.c index e136ed8d82b..c46da6a4703 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } +static inline int init_kernel_text(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext && + addr <= (unsigned long)_einittext) + return 1; + return 0; +} + __notrace_funcgraph int core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && @@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr) return 1; if (system_state == SYSTEM_BOOTING && - addr >= (unsigned long)_sinittext && - addr <= (unsigned long)_einittext) + init_kernel_text(addr)) return 1; return 0; } @@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr) { if (core_kernel_text(addr)) return 1; - return __module_text_address(addr) != NULL; + if (__module_text_address(addr)) + return 1; + /* + * There might be init symbols in saved stacktraces. + * Give those symbols a chance to be printed in + * backtraces (such as lockdep traces). + * + * Since we are after the module-symbols check, there's + * no danger of address overlap: + */ + if (init_kernel_text(addr)) + return 1; + return 0; } int kernel_text_address(unsigned long addr) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3673a3f44d9..981cd485428 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks; atomic_t nr_find_usage_forwards_recursions; atomic_t nr_find_usage_backwards_checks; atomic_t nr_find_usage_backwards_recursions; -# define debug_atomic_inc(ptr) atomic_inc(ptr) -# define debug_atomic_dec(ptr) atomic_dec(ptr) -# define debug_atomic_read(ptr) atomic_read(ptr) -#else -# define debug_atomic_inc(ptr) do { } while (0) -# define debug_atomic_dec(ptr) do { } while (0) -# define debug_atomic_read(ptr) 0 #endif /* @@ -1900,9 +1893,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, curr->comm, task_pid_nr(curr)); print_lock(this); if (forwards) - printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass); + printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); else - printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass); + printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); print_lock_name(other); printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); @@ -2015,7 +2008,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, enum lock_usage_bit bit, const char *name); static int -mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) +mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) { int excl_bit = exclusive_bit(new_bit); int read = new_bit & 1; @@ -2043,7 +2037,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit) * states. */ if ((!read || !dir || STRICT_READ_CHECKS) && - !usage(curr, this, excl_bit, state_name(new_bit))) + !usage(curr, this, excl_bit, state_name(new_bit & ~1))) return 0; /* diff --git a/kernel/panic.c b/kernel/panic.c index 32fe4eff1b8..3fd8c5bf8b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -8,19 +8,19 @@ * This function is used through-out the kernel (including mm and fs) * to indicate a major problem. */ +#include <linux/debug_locks.h> +#include <linux/interrupt.h> +#include <linux/kallsyms.h> +#include <linux/notifier.h> #include <linux/module.h> -#include <linux/sched.h> -#include <linux/delay.h> +#include <linux/random.h> #include <linux/reboot.h> -#include <linux/notifier.h> -#include <linux/init.h> +#include <linux/delay.h> +#include <linux/kexec.h> +#include <linux/sched.h> #include <linux/sysrq.h> -#include <linux/interrupt.h> +#include <linux/init.h> #include <linux/nmi.h> -#include <linux/kexec.h> -#include <linux/debug_locks.h> -#include <linux/random.h> -#include <linux/kallsyms.h> #include <linux/dmi.h> int panic_on_oops; @@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink); * * This function never returns. */ - NORET_TYPE void panic(const char * fmt, ...) { - long i; static char buf[1024]; va_list args; -#if defined(CONFIG_S390) - unsigned long caller = (unsigned long) __builtin_return_address(0); -#endif + long i; /* - * It's possible to come here directly from a panic-assertion and not - * have preempt disabled. Some functions called from here want + * It's possible to come here directly from a panic-assertion and + * not have preempt disabled. Some functions called from here want * preempt to be disabled. No point enabling it later though... */ preempt_disable(); @@ -77,7 +73,6 @@ NORET_TYPE void panic(const char * fmt, ...) #ifdef CONFIG_DEBUG_BUGVERBOSE dump_stack(); #endif - bust_spinlocks(0); /* * If we have crashed and we have a crash kernel loaded let it handle @@ -86,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...) */ crash_kexec(NULL); -#ifdef CONFIG_SMP /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic * situation. */ smp_send_stop(); -#endif atomic_notifier_call_chain(&panic_notifier_list, 0, buf); @@ -102,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...) if (panic_timeout > 0) { /* - * Delay timeout seconds before rebooting the machine. - * We can't use the "normal" timers since we just panicked.. - */ - printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout); + * Delay timeout seconds before rebooting the machine. + * We can't use the "normal" timers since we just panicked. + */ + printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + for (i = 0; i < panic_timeout*1000; ) { touch_nmi_watchdog(); i += panic_blink(i); mdelay(1); i++; } - /* This will not be a clean reboot, with everything - * shutting down. But if there is a chance of - * rebooting the system it will be rebooted. + /* + * This will not be a clean reboot, with everything + * shutting down. But if there is a chance of + * rebooting the system it will be rebooted. */ emergency_restart(); } @@ -127,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif #if defined(CONFIG_S390) - disabled_wait(caller); + { + unsigned long caller; + + caller = (unsigned long)__builtin_return_address(0); + disabled_wait(caller); + } #endif local_irq_enable(); - for (i = 0;;) { + for (i = 0; ; ) { touch_softlockup_watchdog(); i += panic_blink(i); mdelay(1); i++; } + bust_spinlocks(0); } EXPORT_SYMBOL(panic); struct tnt { - u8 bit; - char true; - char false; + u8 bit; + char true; + char false; }; static const struct tnt tnts[] = { - { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, - { TAINT_FORCED_MODULE, 'F', ' ' }, - { TAINT_UNSAFE_SMP, 'S', ' ' }, - { TAINT_FORCED_RMMOD, 'R', ' ' }, - { TAINT_MACHINE_CHECK, 'M', ' ' }, - { TAINT_BAD_PAGE, 'B', ' ' }, - { TAINT_USER, 'U', ' ' }, - { TAINT_DIE, 'D', ' ' }, - { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, - { TAINT_WARN, 'W', ' ' }, - { TAINT_CRAP, 'C', ' ' }, + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, }; /** @@ -195,7 +196,8 @@ const char *print_tainted(void) *s = 0; } else snprintf(buf, sizeof(buf), "Not tainted"); - return(buf); + + return buf; } int test_taint(unsigned flag) @@ -211,7 +213,8 @@ unsigned long get_taint(void) void add_taint(unsigned flag) { - debug_locks = 0; /* can't trust the integrity of the kernel anymore */ + /* can't trust the integrity of the kernel anymore: */ + debug_locks = 0; set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); @@ -266,8 +269,8 @@ static void do_oops_enter_exit(void) } /* - * Return true if the calling CPU is allowed to print oops-related info. This - * is a bit racy.. + * Return true if the calling CPU is allowed to print oops-related info. + * This is a bit racy.. */ int oops_may_print(void) { @@ -276,20 +279,22 @@ int oops_may_print(void) /* * Called when the architecture enters its oops handler, before it prints - * anything. If this is the first CPU to oops, and it's oopsing the first time - * then let it proceed. + * anything. If this is the first CPU to oops, and it's oopsing the first + * time then let it proceed. * - * This is all enabled by the pause_on_oops kernel boot option. We do all this - * to ensure that oopses don't scroll off the screen. It has the side-effect - * of preventing later-oopsing CPUs from mucking up the display, too. + * This is all enabled by the pause_on_oops kernel boot option. We do all + * this to ensure that oopses don't scroll off the screen. It has the + * side-effect of preventing later-oopsing CPUs from mucking up the display, + * too. * - * It turns out that the CPU which is allowed to print ends up pausing for the - * right duration, whereas all the other CPUs pause for twice as long: once in - * oops_enter(), once in oops_exit(). + * It turns out that the CPU which is allowed to print ends up pausing for + * the right duration, whereas all the other CPUs pause for twice as long: + * once in oops_enter(), once in oops_exit(). */ void oops_enter(void) { - debug_locks_off(); /* can't trust the integrity of the kernel anymore */ + /* can't trust the integrity of the kernel anymore: */ + debug_locks_off(); do_oops_enter_exit(); } diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f3db382c2b2..5f21ab2bbcd 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -289,7 +289,7 @@ static int create_image(int platform_mode) * hibernation_snapshot - quiesce devices and create the hibernation * snapshot image. * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the power transition. + * prepare the platform firmware for the power transition. * * Must be called with pm_mutex held */ @@ -412,7 +412,7 @@ static int resume_target_kernel(bool platform_mode) * hibernation_restore - quiesce devices and restore the hibernation * snapshot image. If successful, control returns in hibernation_snaphot() * @platform_mode - if set, use the platform driver, if available, to - * prepare the platform frimware for the transition. + * prepare the platform firmware for the transition. * * Must be called with pm_mutex held */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 5105f5a6a2c..aaad0ec3419 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -687,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data) goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); - if (ret < 0) - goto out_put_task_struct; out_put_task_struct: put_task_struct(child); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cae8a059cf4..2c7b8457d0d 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type) } } +static inline void wait_migrated_callbacks(void); + /* * Orchestrate the specified type of RCU barrier, waiting for all * RCU callbacks of the specified type to complete. @@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type) complete(&rcu_barrier_completion); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); + wait_migrated_callbacks(); } /** @@ -176,9 +179,50 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); +static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0); +static struct rcu_head rcu_migrate_head[3]; +static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq); + +static void rcu_migrate_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_migrate_type_count)) + wake_up(&rcu_migrate_wq); +} + +static inline void wait_migrated_callbacks(void) +{ + wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count)); +} + +static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + if (action == CPU_DYING) { + /* + * preempt_disable() in on_each_cpu() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(), + * and the dead cpu(if it exist) queues rcu_migrate_callback()s. + * + * These callbacks ensure _rcu_barrier() waits for all + * RCU callbacks of the specified type to complete. + */ + atomic_set(&rcu_migrate_type_count, 3); + call_rcu_bh(rcu_migrate_head, rcu_migrate_callback); + call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback); + call_rcu(rcu_migrate_head + 2, rcu_migrate_callback); + } else if (action == CPU_POST_DEAD) { + /* rcu_migrate_head is protected by cpu_add_remove_lock */ + wait_migrated_callbacks(); + } + + return NOTIFY_OK; +} + void __init rcu_init(void) { __rcu_init(); + hotcpu_notifier(rcu_barrier_cpu_hotplug, 0); } void rcu_scheduler_starting(void) diff --git a/kernel/sched.c b/kernel/sched.c index 73513f4e19d..2325db2be31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1110,7 +1110,7 @@ static void hrtick_start(struct rq *rq, u64 delay) if (rq == this_rq()) { hrtimer_restart(timer); } else if (!rq->hrtick_csd_pending) { - __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); rq->hrtick_csd_pending = 1; } } diff --git a/kernel/slow-work.c b/kernel/slow-work.c new file mode 100644 index 00000000000..cf2bc01186e --- /dev/null +++ b/kernel/slow-work.c @@ -0,0 +1,640 @@ +/* Worker thread pool for slow items, such as filesystem lookups or mkdirs + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + * + * See Documentation/slow-work.txt + */ + +#include <linux/module.h> +#include <linux/slow-work.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include <linux/wait.h> + +#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of + * things to do */ +#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after + * OOM */ + +static void slow_work_cull_timeout(unsigned long); +static void slow_work_oom_timeout(unsigned long); + +#ifdef CONFIG_SYSCTL +static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); + +static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, + void __user *, size_t *, loff_t *); +#endif + +/* + * The pool of threads has at least min threads in it as long as someone is + * using the facility, and may have as many as max. + * + * A portion of the pool may be processing very slow operations. + */ +static unsigned slow_work_min_threads = 2; +static unsigned slow_work_max_threads = 4; +static unsigned vslow_work_proportion = 50; /* % of threads that may process + * very slow work */ + +#ifdef CONFIG_SYSCTL +static const int slow_work_min_min_threads = 2; +static int slow_work_max_max_threads = 255; +static const int slow_work_min_vslow = 1; +static const int slow_work_max_vslow = 99; + +ctl_table slow_work_sysctls[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "min-threads", + .data = &slow_work_min_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_min_threads_sysctl, + .extra1 = (void *) &slow_work_min_min_threads, + .extra2 = &slow_work_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max-threads", + .data = &slow_work_max_threads, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = slow_work_max_threads_sysctl, + .extra1 = &slow_work_min_threads, + .extra2 = (void *) &slow_work_max_max_threads, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "vslow-percentage", + .data = &vslow_work_proportion, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = (void *) &slow_work_min_vslow, + .extra2 = (void *) &slow_work_max_vslow, + }, + { .ctl_name = 0 } +}; +#endif + +/* + * The active state of the thread pool + */ +static atomic_t slow_work_thread_count; +static atomic_t vslow_work_executing_count; + +static bool slow_work_may_not_start_new_thread; +static bool slow_work_cull; /* cull a thread due to lack of activity */ +static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0); +static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0); +static struct slow_work slow_work_new_thread; /* new thread starter */ + +/* + * The queues of work items and the lock governing access to them. These are + * shared between all the CPUs. It doesn't make sense to have per-CPU queues + * as the number of threads bears no relation to the number of CPUs. + * + * There are two queues of work items: one for slow work items, and one for + * very slow work items. + */ +static LIST_HEAD(slow_work_queue); +static LIST_HEAD(vslow_work_queue); +static DEFINE_SPINLOCK(slow_work_queue_lock); + +/* + * The thread controls. A variable used to signal to the threads that they + * should exit when the queue is empty, a waitqueue used by the threads to wait + * for signals, and a completion set by the last thread to exit. + */ +static bool slow_work_threads_should_exit; +static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq); +static DECLARE_COMPLETION(slow_work_last_thread_exited); + +/* + * The number of users of the thread pool and its lock. Whilst this is zero we + * have no threads hanging around, and when this reaches zero, we wait for all + * active or queued work items to complete and kill all the threads we do have. + */ +static int slow_work_user_count; +static DEFINE_MUTEX(slow_work_user_lock); + +/* + * Calculate the maximum number of active threads in the pool that are + * permitted to process very slow work items. + * + * The answer is rounded up to at least 1, but may not equal or exceed the + * maximum number of the threads in the pool. This means we always have at + * least one thread that can process slow work items, and we always have at + * least one thread that won't get tied up doing so. + */ +static unsigned slow_work_calc_vsmax(void) +{ + unsigned vsmax; + + vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion; + vsmax /= 100; + vsmax = max(vsmax, 1U); + return min(vsmax, slow_work_max_threads - 1); +} + +/* + * Attempt to execute stuff queued on a slow thread. Return true if we managed + * it, false if there was nothing to do. + */ +static bool slow_work_execute(void) +{ + struct slow_work *work = NULL; + unsigned vsmax; + bool very_slow; + + vsmax = slow_work_calc_vsmax(); + + /* see if we can schedule a new thread to be started if we're not + * keeping up with the work */ + if (!waitqueue_active(&slow_work_thread_wq) && + (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) && + atomic_read(&slow_work_thread_count) < slow_work_max_threads && + !slow_work_may_not_start_new_thread) + slow_work_enqueue(&slow_work_new_thread); + + /* find something to execute */ + spin_lock_irq(&slow_work_queue_lock); + if (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax) { + work = list_entry(vslow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + atomic_inc(&vslow_work_executing_count); + very_slow = true; + } else if (!list_empty(&slow_work_queue)) { + work = list_entry(slow_work_queue.next, + struct slow_work, link); + if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags)) + BUG(); + list_del_init(&work->link); + very_slow = false; + } else { + very_slow = false; /* avoid the compiler warning */ + } + spin_unlock_irq(&slow_work_queue_lock); + + if (!work) + return false; + + if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags)) + BUG(); + + work->ops->execute(work); + + if (very_slow) + atomic_dec(&vslow_work_executing_count); + clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags); + + /* if someone tried to enqueue the item whilst we were executing it, + * then it'll be left unenqueued to avoid multiple threads trying to + * execute it simultaneously + * + * there is, however, a race between us testing the pending flag and + * getting the spinlock, and between the enqueuer setting the pending + * flag and getting the spinlock, so we use a deferral bit to tell us + * if the enqueuer got there first + */ + if (test_bit(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irq(&slow_work_queue_lock); + + if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) && + test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) + goto auto_requeue; + + spin_unlock_irq(&slow_work_queue_lock); + } + + work->ops->put_ref(work); + return true; + +auto_requeue: + /* we must complete the enqueue operation + * - we transfer our ref on the item back to the appropriate queue + * - don't wake another thread up as we're awake already + */ + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + spin_unlock_irq(&slow_work_queue_lock); + return true; +} + +/** + * slow_work_enqueue - Schedule a slow work item for processing + * @work: The work item to queue + * + * Schedule a slow work item for processing. If the item is already undergoing + * execution, this guarantees not to re-enter the execution routine until the + * first execution finishes. + * + * The item is pinned by this function as it retains a reference to it, managed + * through the item operations. The item is unpinned once it has been + * executed. + * + * An item may hog the thread that is running it for a relatively large amount + * of time, sufficient, for example, to perform several lookup, mkdir, create + * and setxattr operations. It may sleep on I/O and may sleep to obtain locks. + * + * Conversely, if a number of items are awaiting processing, it may take some + * time before any given item is given attention. The number of threads in the + * pool may be increased to deal with demand, but only up to a limit. + * + * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in + * the very slow queue, from which only a portion of the threads will be + * allowed to pick items to execute. This ensures that very slow items won't + * overly block ones that are just ordinarily slow. + * + * Returns 0 if successful, -EAGAIN if not. + */ +int slow_work_enqueue(struct slow_work *work) +{ + unsigned long flags; + + BUG_ON(slow_work_user_count <= 0); + BUG_ON(!work); + BUG_ON(!work->ops); + BUG_ON(!work->ops->get_ref); + + /* when honouring an enqueue request, we only promise that we will run + * the work function in the future; we do not promise to run it once + * per enqueue request + * + * we use the PENDING bit to merge together repeat requests without + * having to disable IRQs and take the spinlock, whilst still + * maintaining our promise + */ + if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) { + spin_lock_irqsave(&slow_work_queue_lock, flags); + + /* we promise that we will not attempt to execute the work + * function in more than one thread simultaneously + * + * this, however, leaves us with a problem if we're asked to + * enqueue the work whilst someone is executing the work + * function as simply queueing the work immediately means that + * another thread may try executing it whilst it is already + * under execution + * + * to deal with this, we set the ENQ_DEFERRED bit instead of + * enqueueing, and the thread currently executing the work + * function will enqueue the work item when the work function + * returns and it has cleared the EXECUTING bit + */ + if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) { + set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags); + } else { + if (work->ops->get_ref(work) < 0) + goto cant_get_ref; + if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) + list_add_tail(&work->link, &vslow_work_queue); + else + list_add_tail(&work->link, &slow_work_queue); + wake_up(&slow_work_thread_wq); + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + } + return 0; + +cant_get_ref: + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return -EAGAIN; +} +EXPORT_SYMBOL(slow_work_enqueue); + +/* + * Worker thread culling algorithm + */ +static bool slow_work_cull_thread(void) +{ + unsigned long flags; + bool do_cull = false; + + spin_lock_irqsave(&slow_work_queue_lock, flags); + + if (slow_work_cull) { + slow_work_cull = false; + + if (list_empty(&slow_work_queue) && + list_empty(&vslow_work_queue) && + atomic_read(&slow_work_thread_count) > + slow_work_min_threads) { + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + do_cull = true; + } + } + + spin_unlock_irqrestore(&slow_work_queue_lock, flags); + return do_cull; +} + +/* + * Determine if there is slow work available for dispatch + */ +static inline bool slow_work_available(int vsmax) +{ + return !list_empty(&slow_work_queue) || + (!list_empty(&vslow_work_queue) && + atomic_read(&vslow_work_executing_count) < vsmax); +} + +/* + * Worker thread dispatcher + */ +static int slow_work_thread(void *_data) +{ + int vsmax; + + DEFINE_WAIT(wait); + + set_freezable(); + set_user_nice(current, -5); + + for (;;) { + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + prepare_to_wait(&slow_work_thread_wq, &wait, + TASK_INTERRUPTIBLE); + if (!freezing(current) && + !slow_work_threads_should_exit && + !slow_work_available(vsmax) && + !slow_work_cull) + schedule(); + finish_wait(&slow_work_thread_wq, &wait); + + try_to_freeze(); + + vsmax = vslow_work_proportion; + vsmax *= atomic_read(&slow_work_thread_count); + vsmax /= 100; + + if (slow_work_available(vsmax) && slow_work_execute()) { + cond_resched(); + if (list_empty(&slow_work_queue) && + list_empty(&vslow_work_queue) && + atomic_read(&slow_work_thread_count) > + slow_work_min_threads) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + continue; + } + + if (slow_work_threads_should_exit) + break; + + if (slow_work_cull && slow_work_cull_thread()) + break; + } + + if (atomic_dec_and_test(&slow_work_thread_count)) + complete_and_exit(&slow_work_last_thread_exited, 0); + return 0; +} + +/* + * Handle thread cull timer expiration + */ +static void slow_work_cull_timeout(unsigned long data) +{ + slow_work_cull = true; + wake_up(&slow_work_thread_wq); +} + +/* + * Get a reference on slow work thread starter + */ +static int slow_work_new_thread_get_ref(struct slow_work *work) +{ + return 0; +} + +/* + * Drop a reference on slow work thread starter + */ +static void slow_work_new_thread_put_ref(struct slow_work *work) +{ +} + +/* + * Start a new slow work thread + */ +static void slow_work_new_thread_execute(struct slow_work *work) +{ + struct task_struct *p; + + if (slow_work_threads_should_exit) + return; + + if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads) + return; + + if (!mutex_trylock(&slow_work_user_lock)) + return; + + slow_work_may_not_start_new_thread = true; + atomic_inc(&slow_work_thread_count); + p = kthread_run(slow_work_thread, NULL, "kslowd"); + if (IS_ERR(p)) { + printk(KERN_DEBUG "Slow work thread pool: OOM\n"); + if (atomic_dec_and_test(&slow_work_thread_count)) + BUG(); /* we're running on a slow work thread... */ + mod_timer(&slow_work_oom_timer, + jiffies + SLOW_WORK_OOM_TIMEOUT); + } else { + /* ratelimit the starting of new threads */ + mod_timer(&slow_work_oom_timer, jiffies + 1); + } + + mutex_unlock(&slow_work_user_lock); +} + +static const struct slow_work_ops slow_work_new_thread_ops = { + .get_ref = slow_work_new_thread_get_ref, + .put_ref = slow_work_new_thread_put_ref, + .execute = slow_work_new_thread_execute, +}; + +/* + * post-OOM new thread start suppression expiration + */ +static void slow_work_oom_timeout(unsigned long data) +{ + slow_work_may_not_start_new_thread = false; +} + +#ifdef CONFIG_SYSCTL +/* + * Handle adjustment of the minimum number of threads + */ +static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to start or stop threads */ + n = atomic_read(&slow_work_thread_count) - + slow_work_min_threads; + + if (n < 0 && !slow_work_may_not_start_new_thread) + slow_work_enqueue(&slow_work_new_thread); + else if (n > 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} + +/* + * Handle adjustment of the maximum number of threads + */ +static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int n; + + if (ret == 0) { + mutex_lock(&slow_work_user_lock); + if (slow_work_user_count > 0) { + /* see if we need to stop threads */ + n = slow_work_max_threads - + atomic_read(&slow_work_thread_count); + + if (n < 0) + mod_timer(&slow_work_cull_timer, + jiffies + SLOW_WORK_CULL_TIMEOUT); + } + mutex_unlock(&slow_work_user_lock); + } + + return ret; +} +#endif /* CONFIG_SYSCTL */ + +/** + * slow_work_register_user - Register a user of the facility + * + * Register a user of the facility, starting up the initial threads if there + * aren't any other users at this point. This will return 0 if successful, or + * an error if not. + */ +int slow_work_register_user(void) +{ + struct task_struct *p; + int loop; + + mutex_lock(&slow_work_user_lock); + + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Starting up\n"); + init_completion(&slow_work_last_thread_exited); + + slow_work_threads_should_exit = false; + slow_work_init(&slow_work_new_thread, + &slow_work_new_thread_ops); + slow_work_may_not_start_new_thread = false; + slow_work_cull = false; + + /* start the minimum number of threads */ + for (loop = 0; loop < slow_work_min_threads; loop++) { + atomic_inc(&slow_work_thread_count); + p = kthread_run(slow_work_thread, NULL, "kslowd"); + if (IS_ERR(p)) + goto error; + } + printk(KERN_NOTICE "Slow work thread pool: Ready\n"); + } + + slow_work_user_count++; + mutex_unlock(&slow_work_user_lock); + return 0; + +error: + if (atomic_dec_and_test(&slow_work_thread_count)) + complete(&slow_work_last_thread_exited); + if (loop > 0) { + printk(KERN_ERR "Slow work thread pool:" + " Aborting startup on ENOMEM\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_ERR "Slow work thread pool: Aborted\n"); + } + mutex_unlock(&slow_work_user_lock); + return PTR_ERR(p); +} +EXPORT_SYMBOL(slow_work_register_user); + +/** + * slow_work_unregister_user - Unregister a user of the facility + * + * Unregister a user of the facility, killing all the threads if this was the + * last one. + */ +void slow_work_unregister_user(void) +{ + mutex_lock(&slow_work_user_lock); + + BUG_ON(slow_work_user_count <= 0); + + slow_work_user_count--; + if (slow_work_user_count == 0) { + printk(KERN_NOTICE "Slow work thread pool: Shutting down\n"); + slow_work_threads_should_exit = true; + wake_up_all(&slow_work_thread_wq); + wait_for_completion(&slow_work_last_thread_exited); + printk(KERN_NOTICE "Slow work thread pool:" + " Shut down complete\n"); + } + + del_timer_sync(&slow_work_cull_timer); + + mutex_unlock(&slow_work_user_lock); +} +EXPORT_SYMBOL(slow_work_unregister_user); + +/* + * Initialise the slow work facility + */ +static int __init init_slow_work(void) +{ + unsigned nr_cpus = num_possible_cpus(); + + if (slow_work_max_threads < nr_cpus) + slow_work_max_threads = nr_cpus; +#ifdef CONFIG_SYSCTL + if (slow_work_max_max_threads < nr_cpus * 2) + slow_work_max_max_threads = nr_cpus * 2; +#endif + return 0; +} + +subsys_initcall(init_slow_work); diff --git a/kernel/smp.c b/kernel/smp.c index bbedbb7efe3..858baac568e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -2,40 +2,82 @@ * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 - * */ -#include <linux/init.h> -#include <linux/module.h> -#include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/rculist.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/percpu.h> +#include <linux/init.h> #include <linux/smp.h> +#include <linux/cpu.h> static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); -static LIST_HEAD(call_function_queue); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock); + +static struct { + struct list_head queue; + spinlock_t lock; +} call_function __cacheline_aligned_in_smp = + { + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), + }; enum { - CSD_FLAG_WAIT = 0x01, - CSD_FLAG_ALLOC = 0x02, - CSD_FLAG_LOCK = 0x04, + CSD_FLAG_LOCK = 0x01, }; struct call_function_data { - struct call_single_data csd; - spinlock_t lock; - unsigned int refs; - struct rcu_head rcu_head; - unsigned long cpumask_bits[]; + struct call_single_data csd; + spinlock_t lock; + unsigned int refs; + cpumask_var_t cpumask; }; struct call_single_queue { - struct list_head list; - spinlock_t lock; + struct list_head list; + spinlock_t lock; +}; + +static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { + .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), +}; + +static int +hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return NOTIFY_BAD; + break; + +#ifdef CONFIG_CPU_HOTPLUG + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_cpumask_var(cfd->cpumask); + break; +#endif + }; + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { + .notifier_call = hotplug_cfd, }; static int __cpuinit init_call_single_data(void) { + void *cpu = (void *)(long)smp_processor_id(); int i; for_each_possible_cpu(i) { @@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void) spin_lock_init(&q->lock); INIT_LIST_HEAD(&q->list); } + + hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); + register_cpu_notifier(&hotplug_cfd_notifier); + return 0; } early_initcall(init_call_single_data); -static void csd_flag_wait(struct call_single_data *data) +/* + * csd_lock/csd_unlock used to serialize access to per-cpu csd resources + * + * For non-synchronous ipi calls the csd can still be in use by the + * previous function call. For multi-cpu calls its even more interesting + * as we'll have to ensure no other cpu is observing our csd. + */ +static void csd_lock_wait(struct call_single_data *data) { - /* Wait for response */ - do { - if (!(data->flags & CSD_FLAG_WAIT)) - break; + while (data->flags & CSD_FLAG_LOCK) cpu_relax(); - } while (1); +} + +static void csd_lock(struct call_single_data *data) +{ + csd_lock_wait(data); + data->flags = CSD_FLAG_LOCK; + + /* + * prevent CPU from reordering the above assignment + * to ->flags with any subsequent assignments to other + * fields of the specified call_single_data structure: + */ + smp_mb(); +} + +static void csd_unlock(struct call_single_data *data) +{ + WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + + /* + * ensure we're all done before releasing data: + */ + smp_mb(); + + data->flags &= ~CSD_FLAG_LOCK; } /* - * Insert a previously allocated call_single_data element for execution - * on the given CPU. data must already have ->func, ->info, and ->flags set. + * Insert a previously allocated call_single_data element + * for execution on the given CPU. data must already have + * ->func, ->info, and ->flags set. */ -static void generic_exec_single(int cpu, struct call_single_data *data) +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait) { struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); - int wait = data->flags & CSD_FLAG_WAIT, ipi; unsigned long flags; + int ipi; spin_lock_irqsave(&dst->lock, flags); ipi = list_empty(&dst->list); @@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data) spin_unlock_irqrestore(&dst->lock, flags); /* - * Make the list addition visible before sending the ipi. + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... */ - smp_mb(); - if (ipi) arch_send_call_function_single_ipi(cpu); if (wait) - csd_flag_wait(data); -} - -static void rcu_free_call_data(struct rcu_head *head) -{ - struct call_function_data *data; - - data = container_of(head, struct call_function_data, rcu_head); - - kfree(data); + csd_lock_wait(data); } /* @@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void) int cpu = get_cpu(); /* - * It's ok to use list_for_each_rcu() here even though we may delete - * 'pos', since list_del_rcu() doesn't clear ->next + * Ensure entry is visible on call_function_queue after we have + * entered the IPI. See comment in smp_call_function_many. + * If we don't have this, then we may miss an entry on the list + * and never get another IPI to process it. + */ + smp_mb(); + + /* + * It's ok to use list_for_each_rcu() here even though we may + * delete 'pos', since list_del_rcu() doesn't clear ->next */ - rcu_read_lock(); - list_for_each_entry_rcu(data, &call_function_queue, csd.list) { + list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits))) + spin_lock(&data->lock); + if (!cpumask_test_cpu(cpu, data->cpumask)) { + spin_unlock(&data->lock); continue; + } + cpumask_clear_cpu(cpu, data->cpumask); + spin_unlock(&data->lock); data->csd.func(data->csd.info); spin_lock(&data->lock); - cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits)); WARN_ON(data->refs == 0); - data->refs--; - refs = data->refs; + refs = --data->refs; + if (!refs) { + spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + spin_unlock(&call_function.lock); + } spin_unlock(&data->lock); if (refs) continue; - spin_lock(&call_function_lock); - list_del_rcu(&data->csd.list); - spin_unlock(&call_function_lock); - - if (data->csd.flags & CSD_FLAG_WAIT) { - /* - * serialize stores to data with the flag clear - * and wakeup - */ - smp_wmb(); - data->csd.flags &= ~CSD_FLAG_WAIT; - } - if (data->csd.flags & CSD_FLAG_ALLOC) - call_rcu(&data->rcu_head, rcu_free_call_data); + csd_unlock(&data->csd); } - rcu_read_unlock(); put_cpu(); } /* - * Invoked by arch to handle an IPI for call function single. Must be called - * from the arch with interrupts disabled. + * Invoked by arch to handle an IPI for call function single. Must be + * called from the arch with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { struct call_single_queue *q = &__get_cpu_var(call_single_queue); + unsigned int data_flags; LIST_HEAD(list); - /* - * Need to see other stores to list head for checking whether - * list is empty without holding q->lock - */ - smp_read_barrier_depends(); - while (!list_empty(&q->list)) { - unsigned int data_flags; - - spin_lock(&q->lock); - list_replace_init(&q->list, &list); - spin_unlock(&q->lock); - - while (!list_empty(&list)) { - struct call_single_data *data; - - data = list_entry(list.next, struct call_single_data, - list); - list_del(&data->list); - - /* - * 'data' can be invalid after this call if - * flags == 0 (when called through - * generic_exec_single(), so save them away before - * making the call. - */ - data_flags = data->flags; - - data->func(data->info); - - if (data_flags & CSD_FLAG_WAIT) { - smp_wmb(); - data->flags &= ~CSD_FLAG_WAIT; - } else if (data_flags & CSD_FLAG_LOCK) { - smp_wmb(); - data->flags &= ~CSD_FLAG_LOCK; - } else if (data_flags & CSD_FLAG_ALLOC) - kfree(data); - } + spin_lock(&q->lock); + list_replace_init(&q->list, &list); + spin_unlock(&q->lock); + + while (!list_empty(&list)) { + struct call_single_data *data; + + data = list_entry(list.next, struct call_single_data, list); + list_del(&data->list); + + /* + * 'data' can be invalid after this call if flags == 0 + * (when called through generic_exec_single()), + * so save them away before making the call: + */ + data_flags = data->flags; + + data->func(data->info); + /* - * See comment on outer loop + * Unlocked CSDs are valid through generic_exec_single(): */ - smp_read_barrier_depends(); + if (data_flags & CSD_FLAG_LOCK) + csd_unlock(data); } } @@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data); int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { - struct call_single_data d; + struct call_single_data d = { + .flags = 0, + }; unsigned long flags; - /* prevent preemption and reschedule on another processor, - as well as CPU removal */ - int me = get_cpu(); + int this_cpu; int err = 0; + /* + * prevent preemption and reschedule on another processor, + * as well as CPU removal + */ + this_cpu = get_cpu(); + /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); - if (cpu == me) { + if (cpu == this_cpu) { local_irq_save(flags); func(info); local_irq_restore(flags); - } else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { - struct call_single_data *data; + } else { + if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { + struct call_single_data *data = &d; + + if (!wait) + data = &__get_cpu_var(csd_data); - if (!wait) { - /* - * We are calling a function on a single CPU - * and we are not going to wait for it to finish. - * We first try to allocate the data, but if we - * fail, we fall back to use a per cpu data to pass - * the information to that CPU. Since all callers - * of this code will use the same data, we must - * synchronize the callers to prevent a new caller - * from corrupting the data before the callee - * can access it. - * - * The CSD_FLAG_LOCK is used to let us know when - * the IPI handler is done with the data. - * The first caller will set it, and the callee - * will clear it. The next caller must wait for - * it to clear before we set it again. This - * will make sure the callee is done with the - * data before a new caller will use it. - */ - data = kmalloc(sizeof(*data), GFP_ATOMIC); - if (data) - data->flags = CSD_FLAG_ALLOC; - else { - data = &per_cpu(csd_data, me); - while (data->flags & CSD_FLAG_LOCK) - cpu_relax(); - data->flags = CSD_FLAG_LOCK; - } + csd_lock(data); + + data->func = func; + data->info = info; + generic_exec_single(cpu, data, wait); } else { - data = &d; - data->flags = CSD_FLAG_WAIT; + err = -ENXIO; /* CPU not online */ } - - data->func = func; - data->info = info; - generic_exec_single(cpu, data); - } else { - err = -ENXIO; /* CPU not online */ } put_cpu(); + return err; } EXPORT_SYMBOL(smp_call_function_single); @@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single); * @cpu: The CPU to run on. * @data: Pre-allocated and setup data structure * - * Like smp_call_function_single(), but allow caller to pass in a pre-allocated - * data structure. Useful for embedding @data inside other structures, for - * instance. - * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance. */ -void __smp_call_function_single(int cpu, struct call_single_data *data) +void __smp_call_function_single(int cpu, struct call_single_data *data, + int wait) { + csd_lock(data); + /* Can deadlock when called with interrupts disabled */ - WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled()); + WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); - generic_exec_single(cpu, data); + generic_exec_single(cpu, data, wait); } -/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */ +/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ + #ifndef arch_send_call_function_ipi_mask -#define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) +# define arch_send_call_function_ipi_mask(maskp) \ + arch_send_call_function_ipi(*(maskp)) #endif /** @@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * If @wait is true, then returns once @func has returned. Note that @wait * will be implicitly turned on in case of allocation failures, since @@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data) * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, - void (*func)(void *), void *info, - bool wait) + void (*func)(void *), void *info, bool wait) { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu; + int cpu, next_cpu, this_cpu = smp_processor_id(); /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); - /* So, what's a CPU they want? Ignoring this one. */ + /* So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); - if (cpu == smp_processor_id()) + if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + /* No online cpus? We're done. */ if (cpu >= nr_cpu_ids) return; /* Do we have another CPU which isn't us? */ next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == smp_processor_id()) + if (next_cpu == this_cpu) next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); /* Fastpath: do that cpu by itself. */ @@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask, return; } - data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC); - if (unlikely(!data)) { - /* Slow path. */ - for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) - continue; - if (cpumask_test_cpu(cpu, mask)) - smp_call_function_single(cpu, func, info, wait); - } - return; - } + data = &__get_cpu_var(cfd_data); + csd_lock(&data->csd); - spin_lock_init(&data->lock); - data->csd.flags = CSD_FLAG_ALLOC; - if (wait) - data->csd.flags |= CSD_FLAG_WAIT; + spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; - cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits)); - data->refs = cpumask_weight(to_cpumask(data->cpumask_bits)); + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, data->cpumask); + data->refs = cpumask_weight(data->cpumask); - spin_lock_irqsave(&call_function_lock, flags); - list_add_tail_rcu(&data->csd.list, &call_function_queue); - spin_unlock_irqrestore(&call_function_lock, flags); + spin_lock(&call_function.lock); + /* + * Place entry at the _HEAD_ of the list, so that any cpu still + * observing the entry in generic_smp_call_function_interrupt() + * will not miss any other list entries: + */ + list_add_rcu(&data->csd.list, &call_function.queue); + spin_unlock(&call_function.lock); + + spin_unlock_irqrestore(&data->lock, flags); /* * Make the list addition visible before sending the ipi. + * (IPIs must obey or appear to obey normal Linux cache + * coherency rules -- see comment in generic_exec_single). */ smp_mb(); /* Send a message to all CPUs in the map */ - arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits)); + arch_send_call_function_ipi_mask(data->cpumask); - /* optionally wait for the CPUs to complete */ + /* Optionally wait for the CPUs to complete */ if (wait) - csd_flag_wait(&data->csd); + csd_lock_wait(&data->csd); } EXPORT_SYMBOL(smp_call_function_many); @@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many); * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. * * Returns 0. * @@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait) preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); + return 0; } EXPORT_SYMBOL(smp_call_function); void ipi_call_lock(void) { - spin_lock(&call_function_lock); + spin_lock(&call_function.lock); } void ipi_call_unlock(void) { - spin_unlock(&call_function_lock); + spin_unlock(&call_function.lock); } void ipi_call_lock_irq(void) { - spin_lock_irq(&call_function_lock); + spin_lock_irq(&call_function.lock); } void ipi_call_unlock_irq(void) { - spin_unlock_irq(&call_function_lock); + spin_unlock_irq(&call_function.lock); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 57d3f67f6f3..ea23ec087ee 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void) account_system_vtime(current); __local_bh_disable((unsigned long)__builtin_return_address(0)); - trace_softirq_enter(); + lockdep_softirq_enter(); cpu = smp_processor_id(); restart: @@ -220,7 +220,7 @@ restart: if (pending) wakeup_softirqd(); - trace_softirq_exit(); + lockdep_softirq_exit(); account_system_vtime(current); _local_bh_enable(); @@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir cp->flags = 0; cp->priv = softirq; - __smp_call_function_single(cpu, cp); + __smp_call_function_single(cpu, cp, 0); return 0; } return 1; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5ec4543dfc0..82350f8f04f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -48,6 +48,7 @@ #include <linux/acpi.h> #include <linux/reboot.h> #include <linux/ftrace.h> +#include <linux/slow-work.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &scan_unevictable_handler, }, #endif +#ifdef CONFIG_SLOW_WORK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "slow-work", + .mode = 0555, + .child = slow_work_sysctls, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 34e707e5ab8..504086ab444 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER help Enable the kernel to trace a function at both its return and its entry. - It's first purpose is to trace the duration of functions and - draw a call graph for each thread with some informations like - the return value. - This is done by setting the current return address on the current - task structure into a stack of calls. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. This is done by setting the current return + address on the current task structure into a stack of calls. config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fdf913dfc7e..53e8c8bc0c9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops) } /** - * unregister_ftrace_function - unresgister a function for profiling. + * unregister_ftrace_function - unregister a function for profiling. * @ops - ops structure that holds the function to unregister * * Unregister a function that was added to be called by ftrace profiling. |