Diffstat (limited to 'kernel')
47 files changed, 2529 insertions, 1113 deletions
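The largest part of this diff migrates the generic IRQ layer from irq-number-based irq_chip callbacks (chip->mask(irq), chip->set_affinity(irq, ...)) to irq_data-based ones (chip->irq_mask(&desc->irq_data), ...), with irq_chip_set_defaults() installing temporary compat_* wrappers for chips that still implement the old interface. The sketch below is a minimal, self-contained userspace model of that wrapper pattern; the structures and helper names are simplified stand-ins written for illustration, not the kernel's actual definitions.

/*
 * compat_shim.c - minimal userspace model of the irq_chip compat wrappers
 * introduced by this diff. struct irq_data / struct irq_chip below are
 * simplified stand-ins, kept only to show how an old irq-number-based
 * callback is bridged to the new irq_data-based method.
 */
#include <stdio.h>

struct irq_data;                        /* forward declaration */

struct irq_chip {
        /* old-style callback: takes a bare irq number */
        void (*mask)(unsigned int irq);
        /* new-style callback: takes an irq_data pointer */
        void (*irq_mask)(struct irq_data *data);
};

struct irq_data {
        unsigned int irq;
        struct irq_chip *chip;
};

/*
 * Wrapper in the style of compat_irq_mask(): forwards an irq_data-based
 * call to a chip that only implements the old irq-number interface.
 */
static void compat_irq_mask(struct irq_data *data)
{
        data->chip->mask(data->irq);
}

/*
 * Mirrors the fixup done in irq_chip_set_defaults(): if a chip still
 * provides the old callback, install the compat wrapper for the new one.
 */
static void chip_set_defaults(struct irq_chip *chip)
{
        if (chip->mask && !chip->irq_mask)
                chip->irq_mask = compat_irq_mask;
}

/* An "unconverted" chip that only knows the old interface. */
static void legacy_mask(unsigned int irq)
{
        printf("legacy chip: masking irq %u\n", irq);
}

int main(void)
{
        struct irq_chip legacy_chip = { .mask = legacy_mask };
        struct irq_data data = { .irq = 42, .chip = &legacy_chip };

        chip_set_defaults(&legacy_chip);
        /* core code now calls only the irq_data-based method */
        data.chip->irq_mask(&data);
        return 0;
}

In the tree itself these wrappers are built only while CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED is unset, so architectures that have completed the conversion do not carry the extra indirection.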
diff --git a/kernel/Makefile b/kernel/Makefile index 4d9bf5f8531..e2c9d52cfe9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -87,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c9483d8f614..291ba3d04be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,7 +138,7 @@ struct css_id { * is called after synchronize_rcu(). But for safe use, css_is_removed() * css_tryget() should be used for avoiding race. */ - struct cgroup_subsys_state *css; + struct cgroup_subsys_state __rcu *css; /* * ID of this css. */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b23c0979bbe..51b143e2a07 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk); if (ret) return ret; if (threadgroup) { @@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c, 0, NULL); + ret = security_task_setscheduler(c); if (ret) { rcu_read_unlock(); return ret; diff --git a/kernel/futex.c b/kernel/futex.c index 6a3a5fa1526..a118bf160e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -91,6 +91,7 @@ struct futex_pi_state { /** * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex * @task: the task waiting on the futex * @lock_ptr: the hash bucket lock * @key: the key the futex is hashed on @@ -104,7 +105,7 @@ struct futex_pi_state { * * A futex_q has a woken state, just like tasks have TASK_RUNNING. * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. - * The order of wakup is always to make the first condition true, then + * The order of wakeup is always to make the first condition true, then * the second. * * PI futexes are typically woken before they are removed from the hash list via @@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key) * Slow path to fixup the fault we just took in the atomic write * access to @uaddr. * - * We have no generic implementation of a non destructive write to the + * We have no generic implementation of a non-destructive write to the * user address. We know that we faulted in the atomic pagefault * disabled section so we can as well avoid the #PF overhead by * calling get_user_pages() right away. @@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, */ pi_state = this->pi_state; /* - * Userspace might have messed up non PI and PI futexes + * Userspace might have messed up non-PI and PI futexes */ if (unlikely(!pi_state)) return -EINVAL; @@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q) /* * We set q->lock_ptr = NULL _before_ we wake up the task. If - * a non futex wake up happens on another CPU then the task - * might exit and p would dereference a non existing task + * a non-futex wake up happens on another CPU then the task + * might exit and p would dereference a non-existing task * struct. 
Prevent this by holding a reference on p across the * wake up. */ @@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, /** * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 - * uaddr1: source futex user address - * uaddr2: target futex user address - * nr_wake: number of waiters to wake (must be 1 for requeue_pi) - * nr_requeue: number of waiters to requeue (0-INT_MAX) - * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * @uaddr1: source futex user address + * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED + * @uaddr2: target futex user address + * @nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * @nr_requeue: number of waiters to requeue (0-INT_MAX) + * @cmpval: @uaddr1 expected value (or %NULL) + * @requeue_pi: if we are attempting to requeue from a non-pi futex to a * pi futex (pi to pi requeue is not supported) * * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire @@ -1360,10 +1363,10 @@ out: /* The key must be already stored in q->key. */ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) + __acquires(&hb->lock) { struct futex_hash_bucket *hb; - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) static inline void queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { spin_unlock(&hb->lock); - drop_futex_key_refs(&q->key); } /** @@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * an example). */ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) { int prio; @@ -1471,6 +1475,7 @@ retry: * and dropped here. */ static void unqueue_me_pi(struct futex_q *q) + __releases(q->lock_ptr) { WARN_ON(plist_node_empty(&q->list)); plist_del(&q->list, &q->list.plist); @@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q) q->pi_state = NULL; spin_unlock(q->lock_ptr); - - drop_futex_key_refs(&q->key); } /* @@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared, } retry: - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out; @@ -1822,28 +1828,27 @@ retry: /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; + /* unqueue_me() drops q.key ref */ if (!unqueue_me(&q)) - goto out_put_key; + goto out; ret = -ETIMEDOUT; if (to && !to->task) - goto out_put_key; + goto out; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. 
*/ - if (!signal_pending(current)) { - put_futex_key(fshared, &q.key); + if (!signal_pending(current)) goto retry; - } ret = -ERESTARTSYS; if (!abs_time) - goto out_put_key; + goto out; restart = ¤t_thread_info()->restart_block; restart->fn = futex_wait_restart; - restart->futex.uaddr = (u32 *)uaddr; + restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; @@ -1856,8 +1861,6 @@ retry: ret = -ERESTART_RESTARTBLOCK; -out_put_key: - put_futex_key(fshared, &q.key); out: if (to) { hrtimer_cancel(&to->timer); @@ -1869,7 +1872,7 @@ out: static long futex_wait_restart(struct restart_block *restart) { - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + u32 __user *uaddr = restart->futex.uaddr; int fshared = 0; ktime_t t, *tp = NULL; @@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, q.rt_waiter = &rt_waiter; q.requeue_pi_key = &key2; - /* Prepare to wait on uaddr. */ + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); if (ret) goto out_key2; @@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, * In order for us to be here, we know our q.key == key2, and since * we took the hb->lock above, we also know that futex_requeue() has * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquition by the requeue code. + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. */ /* Check if the requeue code acquired the second futex for us. */ @@ -2458,7 +2466,7 @@ retry: */ static inline int fetch_robust_entry(struct robust_list __user **entry, struct robust_list __user * __user *head, - int *pi) + unsigned int *pi) { unsigned long uentry; @@ -2647,7 +2655,7 @@ static int __init futex_init(void) * of the complex code paths. Also we want to prevent * registration of robust lists in that case. NULL is * guaranteed to fault and we get -EFAULT on functional - * implementation, the non functional ones will return + * implementation, the non-functional ones will return * -ENOSYS. */ curval = cmpxchg_futex_value_locked(NULL, 0, 0); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d49afb2395e..06da4dfc339 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -19,7 +19,7 @@ */ static inline int fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, - compat_uptr_t __user *head, int *pi) + compat_uptr_t __user *head, unsigned int *pi) { if (get_user(*uentry, head)) return -EFAULT; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 0c642d51aac..53ead174da2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - __debug_show_held_locks(t); + debug_show_held_locks(t); touch_nmi_watchdog(); @@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * periodically exit the critical section and enter a new one. * * For preemptible RCU it is sufficient to call rcu_read_unlock in order - * exit the grace period. For classic RCU, a reschedule is required. + * to exit the grace period. For classic RCU, a reschedule is required. 
*/ static void rcu_lock_break(struct task_struct *g, struct task_struct *t) { diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 00000000000..31d766bf5d2 --- /dev/null +++ b/kernel/irq/Kconfig @@ -0,0 +1,53 @@ +config HAVE_GENERIC_HARDIRQS + def_bool n + +if HAVE_GENERIC_HARDIRQS +menu "IRQ subsystem" +# +# Interrupt subsystem related configuration options +# +config GENERIC_HARDIRQS + def_bool y + +config GENERIC_HARDIRQS_NO__DO_IRQ + def_bool y + +# Select this to disable the deprecated stuff +config GENERIC_HARDIRQS_NO_DEPRECATED + def_bool n + +# Options selectable by the architecture code +config HAVE_SPARSE_IRQ + def_bool n + +config GENERIC_IRQ_PROBE + def_bool n + +config GENERIC_PENDING_IRQ + def_bool n + +config AUTO_IRQ_AFFINITY + def_bool n + +config IRQ_PER_CPU + def_bool n + +config HARDIRQS_SW_RESEND + def_bool n + +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on HAVE_SPARSE_IRQ + ---help--- + + Sparse irq numbering is useful for distro kernels that want + to define a high CONFIG_NR_CPUS value but still want to have + low kernel memory footprint on smaller machines. + + ( Sparse irqs can also be beneficial on NUMA boxes, as they spread + out the interrupt descriptors in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. + +endmenu +endif diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 7d047808419..54329cd7b3e 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,7 +1,6 @@ -obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o -obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 2295a31ef11..505798f86c3 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -57,9 +57,10 @@ unsigned long probe_irq_on(void) * Some chips need to know about probing in * progress: */ - if (desc->chip->set_type) - desc->chip->set_type(i, IRQ_TYPE_PROBE); - desc->chip->startup(i); + if (desc->irq_data.chip->irq_set_type) + desc->irq_data.chip->irq_set_type(&desc->irq_data, + IRQ_TYPE_PROBE); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -76,7 +77,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && !(desc->status & IRQ_NOPROBE)) { desc->status |= IRQ_AUTODETECT | IRQ_WAITING; - if (desc->chip->startup(i)) + if (desc->irq_data.chip->irq_startup(&desc->irq_data)) desc->status |= IRQ_PENDING; } raw_spin_unlock_irq(&desc->lock); @@ -98,7 +99,7 @@ unsigned long probe_irq_on(void) /* It triggered already - consider it spurious. 
*/ if (!(status & IRQ_WAITING)) { desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } else if (i < 32) mask |= 1 << i; @@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val) mask |= 1 << i; desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } @@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val) nr_of_irqs++; } desc->status = status & ~IRQ_AUTODETECT; - desc->chip->shutdown(i); + desc->irq_data.chip->irq_shutdown(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b7091d5ca2f..baa5c4acad8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,108 +18,6 @@ #include "internals.h" -static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc; - unsigned long flags; - - desc = irq_to_desc(irq); - if (!desc) { - WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); - return; - } - - /* Ensure we don't have left over values from a previous use of this irq */ - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status = IRQ_DISABLED; - desc->chip = &no_irq_chip; - desc->handle_irq = handle_bad_irq; - desc->depth = 1; - desc->msi_desc = NULL; - desc->handler_data = NULL; - if (!keep_chip_data) - desc->chip_data = NULL; - desc->action = NULL; - desc->irq_count = 0; - desc->irqs_unhandled = 0; -#ifdef CONFIG_SMP - cpumask_setall(desc->affinity); -#ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_clear(desc->pending_mask); -#endif -#endif - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) -{ - dynamic_irq_init_x(irq, false); -} - -/** - * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->chip_data to NULL - */ -void dynamic_irq_init_keep_chip_data(unsigned int irq) -{ - dynamic_irq_init_x(irq, true); -} - -static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - if (!desc) { - WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); - return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - if (desc->action) { - raw_spin_unlock_irqrestore(&desc->lock, flags); - WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", - irq); - return; - } - desc->msi_desc = NULL; - desc->handler_data = NULL; - if (!keep_chip_data) - desc->chip_data = NULL; - desc->handle_irq = handle_bad_irq; - desc->chip = &no_irq_chip; - desc->name = NULL; - clear_kstat_irqs(desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -/** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_cleanup(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, false); -} - -/** - * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq - * @irq: irq number to initialize - * - * does not set irq_to_desc(irq)->chip_data to NULL - */ -void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) -{ - dynamic_irq_cleanup_x(irq, true); -} - - /** * set_irq_chip - set the irq chip for an irq * @irq: irq number @@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip) raw_spin_lock_irqsave(&desc->lock, flags); 
irq_chip_set_defaults(chip); - desc->chip = chip; + desc->irq_data.chip = chip; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->handler_data = data; + desc->irq_data.handler_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry) } raw_spin_lock_irqsave(&desc->lock, flags); - desc->msi_desc = entry; + desc->irq_data.msi_desc = entry; if (entry) entry->irq = irq; raw_spin_unlock_irqrestore(&desc->lock, flags); @@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data) return -EINVAL; } - if (!desc->chip) { + if (!desc->irq_data.chip) { printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); return -EINVAL; } raw_spin_lock_irqsave(&desc->lock, flags); - desc->chip_data = data; + desc->irq_data.chip_data = data; raw_spin_unlock_irqrestore(&desc->lock, flags); return 0; } EXPORT_SYMBOL(set_irq_chip_data); +struct irq_data *irq_get_irq_data(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? &desc->irq_data : NULL; +} +EXPORT_SYMBOL_GPL(irq_get_irq_data); + /** * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq * @@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread); /* * default enable function */ -static void default_enable(unsigned int irq) +static void default_enable(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } /* * default disable function */ -static void default_disable(unsigned int irq) +static void default_disable(struct irq_data *data) { } /* * default startup function */ -static unsigned int default_startup(unsigned int irq) +static unsigned int default_startup(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->enable(irq); + desc->irq_data.chip->irq_enable(data); return 0; } /* * default shutdown function */ -static void default_shutdown(unsigned int irq) +static void default_shutdown(struct irq_data *data) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc = irq_data_to_desc(data); - desc->chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +/* Temporary migration helpers */ +static void compat_irq_mask(struct irq_data *data) +{ + data->chip->mask(data->irq); +} + +static void compat_irq_unmask(struct irq_data *data) +{ + data->chip->unmask(data->irq); +} + +static void compat_irq_ack(struct irq_data *data) +{ + data->chip->ack(data->irq); +} + +static void compat_irq_mask_ack(struct irq_data *data) +{ + data->chip->mask_ack(data->irq); +} + +static void compat_irq_eoi(struct irq_data *data) +{ + data->chip->eoi(data->irq); +} + +static void compat_irq_enable(struct irq_data *data) +{ + data->chip->enable(data->irq); +} + +static void compat_irq_disable(struct irq_data *data) +{ + data->chip->disable(data->irq); +} + +static void compat_irq_shutdown(struct irq_data *data) +{ + data->chip->shutdown(data->irq); +} + +static unsigned int compat_irq_startup(struct irq_data *data) +{ + return data->chip->startup(data->irq); +} + +static int compat_irq_set_affinity(struct irq_data *data, + const struct cpumask *dest, 
bool force) +{ + return data->chip->set_affinity(data->irq, dest); +} + +static int compat_irq_set_type(struct irq_data *data, unsigned int type) +{ + return data->chip->set_type(data->irq, type); +} + +static int compat_irq_set_wake(struct irq_data *data, unsigned int on) +{ + return data->chip->set_wake(data->irq, on); +} + +static int compat_irq_retrigger(struct irq_data *data) +{ + return data->chip->retrigger(data->irq); +} + +static void compat_bus_lock(struct irq_data *data) +{ + data->chip->bus_lock(data->irq); +} + +static void compat_bus_sync_unlock(struct irq_data *data) +{ + data->chip->bus_sync_unlock(data->irq); +} +#endif + /* * Fixup enable/disable function pointers */ void irq_chip_set_defaults(struct irq_chip *chip) { - if (!chip->enable) - chip->enable = default_enable; - if (!chip->disable) - chip->disable = default_disable; - if (!chip->startup) - chip->startup = default_startup; +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED /* - * We use chip->disable, when the user provided its own. When - * we have default_disable set for chip->disable, then we need + * Compat fixup functions need to be before we set the + * defaults for enable/disable/startup/shutdown + */ + if (chip->enable) + chip->irq_enable = compat_irq_enable; + if (chip->disable) + chip->irq_disable = compat_irq_disable; + if (chip->shutdown) + chip->irq_shutdown = compat_irq_shutdown; + if (chip->startup) + chip->irq_startup = compat_irq_startup; +#endif + /* + * The real defaults + */ + if (!chip->irq_enable) + chip->irq_enable = default_enable; + if (!chip->irq_disable) + chip->irq_disable = default_disable; + if (!chip->irq_startup) + chip->irq_startup = default_startup; + /* + * We use chip->irq_disable, when the user provided its own. When + * we have default_disable set for chip->irq_disable, then we need * to use default_shutdown, otherwise the irq line is not * disabled on free_irq(): */ - if (!chip->shutdown) - chip->shutdown = chip->disable != default_disable ? - chip->disable : default_shutdown; - if (!chip->name) - chip->name = chip->typename; + if (!chip->irq_shutdown) + chip->irq_shutdown = chip->irq_disable != default_disable ? 
+ chip->irq_disable : default_shutdown; + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED if (!chip->end) chip->end = dummy_irq_chip.end; + + /* + * Now fix up the remaining compat handlers + */ + if (chip->bus_lock) + chip->irq_bus_lock = compat_bus_lock; + if (chip->bus_sync_unlock) + chip->irq_bus_sync_unlock = compat_bus_sync_unlock; + if (chip->mask) + chip->irq_mask = compat_irq_mask; + if (chip->unmask) + chip->irq_unmask = compat_irq_unmask; + if (chip->ack) + chip->irq_ack = compat_irq_ack; + if (chip->mask_ack) + chip->irq_mask_ack = compat_irq_mask_ack; + if (chip->eoi) + chip->irq_eoi = compat_irq_eoi; + if (chip->set_affinity) + chip->irq_set_affinity = compat_irq_set_affinity; + if (chip->set_type) + chip->irq_set_type = compat_irq_set_type; + if (chip->set_wake) + chip->irq_set_wake = compat_irq_set_wake; + if (chip->retrigger) + chip->irq_retrigger = compat_irq_retrigger; +#endif } -static inline void mask_ack_irq(struct irq_desc *desc, int irq) +static inline void mask_ack_irq(struct irq_desc *desc) { - if (desc->chip->mask_ack) - desc->chip->mask_ack(irq); + if (desc->irq_data.chip->irq_mask_ack) + desc->irq_data.chip->irq_mask_ack(&desc->irq_data); else { - desc->chip->mask(irq); - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); } desc->status |= IRQ_MASKED; } -static inline void mask_irq(struct irq_desc *desc, int irq) +static inline void mask_irq(struct irq_desc *desc) { - if (desc->chip->mask) { - desc->chip->mask(irq); + if (desc->irq_data.chip->irq_mask) { + desc->irq_data.chip->irq_mask(&desc->irq_data); desc->status |= IRQ_MASKED; } } -static inline void unmask_irq(struct irq_desc *desc, int irq) +static inline void unmask_irq(struct irq_desc *desc) { - if (desc->chip->unmask) { - desc->chip->unmask(irq); + if (desc->irq_data.chip->irq_unmask) { + desc->irq_data.chip->irq_unmask(&desc->irq_data); desc->status &= ~IRQ_MASKED; } } @@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; raw_spin_lock(&desc->lock); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; @@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) desc->status &= ~IRQ_INPROGRESS; if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) - unmask_irq(desc, irq); + unmask_irq(desc); out_unlock: raw_spin_unlock(&desc->lock); } @@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - mask_irq(desc, irq); + mask_irq(desc); goto out; } @@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; out: - desc->chip->eoi(irq); + desc->irq_data.chip->irq_eoi(&desc->irq_data); raw_spin_unlock(&desc->lock); } @@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || !desc->action)) { desc->status |= (IRQ_PENDING | IRQ_MASKED); - mask_ack_irq(desc, irq); + mask_ack_irq(desc); goto out_unlock; } kstat_incr_irqs_this_cpu(irq, desc); /* Start handling the irq */ - if (desc->chip->ack) - desc->chip->ack(irq); + desc->irq_data.chip->irq_ack(&desc->irq_data); /* Mark the IRQ currently in progress.*/ desc->status |= IRQ_INPROGRESS; @@ -607,7 +635,7 @@ 
handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - mask_irq(desc, irq); + mask_irq(desc); goto out_unlock; } @@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - unmask_irq(desc, irq); + unmask_irq(desc); } desc->status &= ~IRQ_PENDING; @@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) kstat_incr_irqs_this_cpu(irq, desc); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); - if (desc->chip->eoi) - desc->chip->eoi(irq); + if (desc->irq_data.chip->irq_eoi) + desc->irq_data.chip->irq_eoi(&desc->irq_data); } void @@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) handle = handle_bad_irq; - else if (desc->chip == &no_irq_chip) { + else if (desc->irq_data.chip == &no_irq_chip) { printk(KERN_WARNING "Trying to install %sinterrupt handler " "for IRQ%d\n", is_chained ? "chained " : "", irq); /* @@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, * prevent us to setup the interrupt at all. Switch it to * dummy_irq_chip for easy transition. */ - desc->chip = &dummy_irq_chip; + desc->irq_data.chip = &dummy_irq_chip; } - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* Uninstall? */ if (handle == handle_bad_irq) { - if (desc->chip != &no_irq_chip) - mask_ack_irq(desc, irq); + if (desc->irq_data.chip != &no_irq_chip) + mask_ack_irq(desc); desc->status |= IRQ_DISABLED; desc->depth = 1; } @@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, desc->status &= ~IRQ_DISABLED; desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; desc->depth = 0; - desc->chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL_GPL(__set_irq_handler); @@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void set_irq_noprobe(unsigned int irq) +void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); + if (!desc) return; - } - - raw_spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_NOPROBE; - raw_spin_unlock_irqrestore(&desc->lock, flags); -} - -void set_irq_probe(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - if (!desc) { - printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); - return; - } + /* Sanitize flags */ + set &= IRQF_MODIFY_MASK; + clr &= IRQF_MODIFY_MASK; raw_spin_lock_irqsave(&desc->lock, flags); - desc->status &= ~IRQ_NOPROBE; + desc->status &= ~clr; + desc->status |= set; raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 00000000000..20dc5474947 --- /dev/null +++ b/kernel/irq/dummychip.c @@ -0,0 +1,68 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 
2005-2006, Thomas Gleixner, Russell King + * + * This file contains the dummy interrupt chip implementation + */ +#include <linux/interrupt.h> +#include <linux/irq.h> + +#include "internals.h" + +/* + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. + */ +static void ack_bad(struct irq_data *data) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + print_irq_desc(data->irq, desc); + ack_bad_irq(data->irq); +} + +/* + * NOP functions + */ +static void noop(struct irq_data *data) { } + +static unsigned int noop_ret(struct irq_data *data) +{ + return 0; +} + +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static void compat_noop(unsigned int irq) { } +#define END_INIT .end = compat_noop +#else +#define END_INIT +#endif + +/* + * Generic no controller implementation + */ +struct irq_chip no_irq_chip = { + .name = "none", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = ack_bad, + END_INIT +}; + +/* + * Generic dummy implementation which can be used for + * real dumb interrupt sources + */ +struct irq_chip dummy_irq_chip = { + .name = "dummy", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, + END_INIT +}; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 27e5c691122..e2347eb6330 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,24 +11,15 @@ */ #include <linux/irq.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/module.h> #include <linux/random.h> +#include <linux/sched.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> -#include <linux/rculist.h> -#include <linux/hash.h> -#include <linux/radix-tree.h> + #include <trace/events/irq.h> #include "internals.h" -/* - * lockdep: we want to handle all irq_desc locks as a single lock-class: - */ -struct lock_class_key irq_desc_lock_class; - /** * handle_bad_irq - handle spurious and unhandled irqs * @irq: the interrupt number @@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) ack_bad_irq(irq); } -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -static void __init init_irq_default_affinity(void) -{ - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); -} -#else -static void __init init_irq_default_affinity(void) -{ -} -#endif - -/* - * Linux has a controller-independent interrupt architecture. - * Every controller has a 'controller-template', that is used - * by the main code to do the right thing. Each driver-visible - * interrupt source is transparently wired to the appropriate - * controller. Thus drivers need not be aware of the - * interrupt-controller. - * - * The code is designed to be easily extended with new/different - * interrupt controllers, without having to do assembly magic or - * having to touch the generic code. 
- * - * Controller mappings for all interrupt sources: - */ -int nr_irqs = NR_IRQS; -EXPORT_SYMBOL_GPL(nr_irqs); - -#ifdef CONFIG_SPARSE_IRQ - -static struct irq_desc irq_desc_init = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -}; - -void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) -{ - void *ptr; - - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - - /* - * don't overwite if can not get new one - * init_copy_kstat_irqs() could still use old one - */ - if (ptr) { - printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node); - desc->kstat_irqs = ptr; - } -} - -static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) -{ - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); - - raw_spin_lock_init(&desc->lock); - desc->irq = irq; -#ifdef CONFIG_SMP - desc->node = node; -#endif - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_kstat_irqs(desc, node, nr_cpu_ids); - if (!desc->kstat_irqs) { - printk(KERN_ERR "can not alloc kstat_irqs\n"); - BUG_ON(1); - } - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } - init_desc_masks(desc); - arch_init_chip_data(desc, node); -} - -/* - * Protect the sparse_irqs: - */ -DEFINE_RAW_SPINLOCK(sparse_irq_lock); - -static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); - -static void set_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - radix_tree_insert(&irq_desc_tree, irq, desc); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return radix_tree_lookup(&irq_desc_tree, irq); -} - -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - void **ptr; - - ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); - if (ptr) - radix_tree_replace_slot(ptr, desc); -} - -static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { - [0 ... 
NR_IRQS_LEGACY-1] = { - .irq = -1, - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock), - } -}; - -static unsigned int *kstat_irqs_legacy; - -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int legacy_count; - int node; - int i; - - init_irq_default_affinity(); - - /* initialize nr_irqs based on nr_cpu_ids */ - arch_probe_nr_irqs(); - printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); - - desc = irq_desc_legacy; - legacy_count = ARRAY_SIZE(irq_desc_legacy); - node = first_online_node; - - /* allocate based on nr_cpu_ids */ - kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * - sizeof(int), GFP_NOWAIT, node); - - for (i = 0; i < legacy_count; i++) { - desc[i].irq = i; -#ifdef CONFIG_SMP - desc[i].node = node; -#endif - desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; - lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - alloc_desc_masks(&desc[i], node, true); - init_desc_masks(&desc[i]); - set_irq_desc(i, &desc[i]); - } - - return arch_early_irq_init(); -} - -struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) -{ - struct irq_desc *desc; - unsigned long flags; - - if (irq >= nr_irqs) { - WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", - irq, nr_irqs); - return NULL; - } - - desc = irq_to_desc(irq); - if (desc) - return desc; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - if (desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - - printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); - if (!desc) { - printk(KERN_ERR "can not alloc irq_desc\n"); - BUG_ON(1); - } - init_one_irq_desc(irq, desc, node); - - set_irq_desc(irq, desc); - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -#else /* !CONFIG_SPARSE_IRQ */ - -struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { - [0 ... NR_IRQS-1] = { - .status = IRQ_DISABLED, - .chip = &no_irq_chip, - .handle_irq = handle_bad_irq, - .depth = 1, - .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), - } -}; - -static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; -int __init early_irq_init(void) -{ - struct irq_desc *desc; - int count; - int i; - - init_irq_default_affinity(); - - printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); - - desc = irq_desc; - count = ARRAY_SIZE(irq_desc); - - for (i = 0; i < count; i++) { - desc[i].irq = i; - alloc_desc_masks(&desc[i], 0, true); - init_desc_masks(&desc[i]); - desc[i].kstat_irqs = kstat_irqs_all[i]; - } - return arch_early_irq_init(); -} - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - return (irq < NR_IRQS) ? irq_desc + irq : NULL; -} - -struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) -{ - return irq_to_desc(irq); -} -#endif /* !CONFIG_SPARSE_IRQ */ - -void clear_kstat_irqs(struct irq_desc *desc) -{ - memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); -} - -/* - * What should we do if we get a hw irq event on an illegal vector? - * Each architecture has to answer this themself. 
- */ -static void ack_bad(unsigned int irq) -{ - struct irq_desc *desc = irq_to_desc(irq); - - print_irq_desc(irq, desc); - ack_bad_irq(irq); -} - -/* - * NOP functions - */ -static void noop(unsigned int irq) -{ -} - -static unsigned int noop_ret(unsigned int irq) -{ - return 0; -} - -/* - * Generic no controller implementation - */ -struct irq_chip no_irq_chip = { - .name = "none", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = ack_bad, - .end = noop, -}; - -/* - * Generic dummy implementation which can be used for - * real dumb interrupt sources - */ -struct irq_chip dummy_irq_chip = { - .name = "dummy", - .startup = noop_ret, - .shutdown = noop, - .enable = noop, - .disable = noop, - .ack = noop, - .mask = noop, - .unmask = noop, - .end = noop, -}; - /* * Special, empty irq handler: */ @@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq) /* * No locking required for CPU-local interrupts: */ - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); if (likely(!(desc->status & IRQ_DISABLED))) { action_ret = handle_IRQ_event(irq, desc->action); if (!noirqdebug) note_interrupt(irq, desc, action_ret); } - desc->chip->end(irq); + desc->irq_data.chip->end(irq); return 1; } raw_spin_lock(&desc->lock); - if (desc->chip->ack) - desc->chip->ack(irq); + if (desc->irq_data.chip->ack) + desc->irq_data.chip->ack(irq); /* * REPLAY is when Linux resends an IRQ that was dropped earlier * WAITING is used by probe to mark irqs that are being tested @@ -530,27 +223,9 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. */ - desc->chip->end(irq); + desc->irq_data.chip->end(irq); raw_spin_unlock(&desc->lock); return 1; } #endif - -void early_init_irq_lock_class(void) -{ - struct irq_desc *desc; - int i; - - for_each_irq_desc(i, desc) { - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - } -} - -unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - struct irq_desc *desc = irq_to_desc(irq); - return desc ? 
desc->kstat_irqs[cpu] : 0; -} -EXPORT_SYMBOL(kstat_irqs_cpu); - diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c63f3bc88f0..4571ae7e085 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -1,9 +1,12 @@ /* * IRQ subsystem internal functions and variables: */ +#include <linux/irqdesc.h> extern int noirqdebug; +#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) + /* Set default functions for irq_chip structures: */ extern void irq_chip_set_defaults(struct irq_chip *chip); @@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); -extern void clear_kstat_irqs(struct irq_desc *desc); -extern raw_spinlock_t sparse_irq_lock; -#ifdef CONFIG_SPARSE_IRQ -void replace_irq_desc(unsigned int irq, struct irq_desc *desc); -#endif +/* Resending of interrupts :*/ +void check_irq_resend(struct irq_desc *desc, unsigned int irq); #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); +extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); extern void register_handler_proc(unsigned int irq, struct irqaction *action); extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); #else static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } +static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } static inline void register_handler_proc(unsigned int irq, struct irqaction *action) { } static inline void unregister_handler_proc(unsigned int irq, @@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq); extern void irq_set_thread_affinity(struct irq_desc *desc); +#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED +static inline void irq_end(unsigned int irq, struct irq_desc *desc) +{ + if (desc->irq_data.chip && desc->irq_data.chip->end) + desc->irq_data.chip->end(irq); +} +#else +static inline void irq_end(unsigned int irq, struct irq_desc *desc) { } +#endif + /* Inline functions for support of irq chips on slow busses */ -static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_lock(struct irq_desc *desc) { - if (unlikely(desc->chip->bus_lock)) - desc->chip->bus_lock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_lock)) + desc->irq_data.chip->irq_bus_lock(&desc->irq_data); } -static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) +static inline void chip_bus_sync_unlock(struct irq_desc *desc) { - if (unlikely(desc->chip->bus_sync_unlock)) - desc->chip->bus_sync_unlock(irq); + if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) + desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } /* @@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); printk("->handle_irq(): %p, ", desc->handle_irq); print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->chip(): %p, ", desc->chip); - print_symbol("%s\n", (unsigned long)desc->chip); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + print_symbol("%s\n", (unsigned long)desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { 
printk("->action->handler(): %p, ", desc->action->handler); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 00000000000..9d917ff7267 --- /dev/null +++ b/kernel/irq/irqdesc.c @@ -0,0 +1,395 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the interrupt descriptor management code + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ +#include <linux/irq.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/kernel_stat.h> +#include <linux/radix-tree.h> +#include <linux/bitmap.h> + +#include "internals.h" + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + +#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +static void __init init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpumask_setall(irq_default_affinity); +} +#else +static void __init init_irq_default_affinity(void) +{ +} +#endif + +#ifdef CONFIG_SMP +static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +{ + if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + return -ENOMEM; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + free_cpumask_var(desc->irq_data.affinity); + return -ENOMEM; + } +#endif + return 0; +} + +static void desc_smp_init(struct irq_desc *desc, int node) +{ + desc->irq_data.node = node; + cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + +static inline int desc_node(struct irq_desc *desc) +{ + return desc->irq_data.node; +} + +#else +static inline int +alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline int desc_node(struct irq_desc *desc) { return 0; } +#endif + +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +{ + desc->irq_data.irq = irq; + desc->irq_data.chip = &no_irq_chip; + desc->irq_data.chip_data = NULL; + desc->irq_data.handler_data = NULL; + desc->irq_data.msi_desc = NULL; + desc->status = IRQ_DEFAULT_INIT_FLAGS; + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->irq_count = 0; + desc->irqs_unhandled = 0; + desc->name = NULL; + memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs))); + desc_smp_init(desc, node); +} + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + +static DEFINE_MUTEX(sparse_irq_lock); +static DECLARE_BITMAP(allocated_irqs, NR_IRQS); + +#ifdef CONFIG_SPARSE_IRQ + +static RADIX_TREE(irq_desc_tree, GFP_KERNEL); + +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +static void delete_irq_desc(unsigned int irq) +{ + radix_tree_delete(&irq_desc_tree, irq); +} + +#ifdef CONFIG_SMP +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->irq_data.affinity); +} +#else +static inline void free_masks(struct irq_desc *desc) { } +#endif + +static struct irq_desc *alloc_desc(int irq, int node) +{ + struct irq_desc *desc; + gfp_t 
gfp = GFP_KERNEL; + + desc = kzalloc_node(sizeof(*desc), gfp, node); + if (!desc) + return NULL; + /* allocate based on nr_cpu_ids */ + desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs), + gfp, node); + if (!desc->kstat_irqs) + goto err_desc; + + if (alloc_masks(desc, gfp, node)) + goto err_kstat; + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + + desc_set_defaults(irq, desc, node); + + return desc; + +err_kstat: + kfree(desc->kstat_irqs); +err_desc: + kfree(desc); + return NULL; +} + +static void free_desc(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unregister_irq_proc(irq, desc); + + mutex_lock(&sparse_irq_lock); + delete_irq_desc(irq); + mutex_unlock(&sparse_irq_lock); + + free_masks(desc); + kfree(desc->kstat_irqs); + kfree(desc); +} + +static int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + struct irq_desc *desc; + int i; + + for (i = 0; i < cnt; i++) { + desc = alloc_desc(start + i, node); + if (!desc) + goto err; + mutex_lock(&sparse_irq_lock); + irq_insert_desc(start + i, desc); + mutex_unlock(&sparse_irq_lock); + } + return start; + +err: + for (i--; i >= 0; i--) + free_desc(start + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return -ENOMEM; +} + +struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) +{ + int res = irq_alloc_descs(irq, irq, 1, node); + + if (res == -EEXIST || res == irq) + return irq_to_desc(irq); + return NULL; +} + +int __init early_irq_init(void) +{ + int i, initcnt, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + /* Let arch update nr_irqs and return the nr of preallocated irqs */ + initcnt = arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + + for (i = 0; i < initcnt; i++) { + desc = alloc_desc(i, node); + set_bit(i, allocated_irqs); + irq_insert_desc(i, desc); + } + return arch_early_irq_init(); +} + +#else /* !CONFIG_SPARSE_IRQ */ + +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { + [0 ... NR_IRQS-1] = { + .status = IRQ_DEFAULT_INIT_FLAGS, + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + } +}; + +static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS]; +int __init early_irq_init(void) +{ + int count, i, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) { + desc[i].irq_data.irq = i; + desc[i].irq_data.chip = &no_irq_chip; + desc[i].kstat_irqs = kstat_irqs_all[i]; + alloc_masks(desc + i, GFP_KERNEL, node); + desc_smp_init(desc + i, node); + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + } + return arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? 
irq_desc + irq : NULL; +} + +struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) +{ + return irq_to_desc(irq); +} + +static void free_desc(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} + +static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + return start; +} +#endif /* !CONFIG_SPARSE_IRQ */ + +/* Dynamic interrupt handling */ + +/** + * irq_free_descs - free irq descriptors + * @from: Start of descriptor range + * @cnt: Number of consecutive irqs to free + */ +void irq_free_descs(unsigned int from, unsigned int cnt) +{ + int i; + + if (from >= nr_irqs || (from + cnt) > nr_irqs) + return; + + for (i = 0; i < cnt; i++) + free_desc(from + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, from, cnt); + mutex_unlock(&sparse_irq_lock); +} + +/** + * irq_alloc_descs - allocate and initialize a range of irq descriptors + * @irq: Allocate for specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate. + * @node: Preferred node on which the irq descriptor should be allocated + * + * Returns the first irq number or error code + */ +int __ref +irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +{ + int start, ret; + + if (!cnt) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + ret = -EEXIST; + if (irq >=0 && start != irq) + goto err; + + ret = -ENOMEM; + if (start >= nr_irqs) + goto err; + + bitmap_set(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return alloc_descs(start, cnt, node); + +err: + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_reserve_irqs - mark irqs allocated + * @from: mark from irq number + * @cnt: number of irqs to mark + * + * Returns 0 on success or an appropriate error code + */ +int irq_reserve_irqs(unsigned int from, unsigned int cnt) +{ + unsigned int start; + int ret = 0; + + if (!cnt || (from + cnt) > nr_irqs) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + if (start == from) + bitmap_set(allocated_irqs, start, cnt); + else + ret = -EEXIST; + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_get_next_irq - get next allocated irq number + * @offset: where to start the search + * + * Returns next irq number after offset or nr_irqs if none is found. + */ +unsigned int irq_get_next_irq(unsigned int offset) +{ + return find_next_bit(allocated_irqs, nr_irqs, offset); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); +} + +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc ? 
desc->kstat_irqs[cpu] : 0; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c3003e9d91a..644e8d5fa36 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || - !desc->chip->set_affinity) + if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip || + !desc->irq_data.chip->irq_set_affinity) return 0; return 1; @@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc) int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; unsigned long flags; - if (!desc->chip->set_affinity) + if (!chip->irq_set_affinity) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT) { - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } } @@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->pending_mask, cpumask); } #else - if (!desc->chip->set_affinity(irq, cpumask)) { - cpumask_copy(desc->affinity, cpumask); + if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) { + cpumask_copy(desc->irq_data.affinity, cpumask); irq_set_thread_affinity(desc); } #endif @@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, desc->affinity); + desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false); return 0; } @@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) if (!desc->depth++) { desc->status |= IRQ_DISABLED; - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } } @@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __disable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(disable_irq_nosync); @@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) * IRQ line is re-enabled. * * This function may be called from IRQ context only when - * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 
*/ void enable_irq(unsigned int irq) { @@ -323,11 +324,11 @@ void enable_irq(unsigned int irq) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, false); raw_spin_unlock_irqrestore(&desc->lock, flags); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(enable_irq); @@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; - if (desc->chip->set_wake) - ret = desc->chip->set_wake(irq, on); + if (desc->irq_data.chip->irq_set_wake) + ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); return ret; } @@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc) } int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, - unsigned long flags) + unsigned long flags) { int ret; - struct irq_chip *chip = desc->chip; + struct irq_chip *chip = desc->irq_data.chip; - if (!chip || !chip->set_type) { + if (!chip || !chip->irq_set_type) { /* * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? @@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, } /* caller masked out all except trigger mode flags */ - ret = chip->set_type(irq, flags); + ret = chip->irq_set_type(&desc->irq_data, flags); if (ret) - pr_err("setting trigger mode %d for irq %u failed (%pF)\n", - (int)flags, irq, chip->set_type); + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); else { if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) flags |= IRQ_LEVEL; @@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; - if (chip != desc->chip) - irq_chip_set_defaults(desc->chip); + if (chip != desc->irq_data.chip) + irq_chip_set_defaults(desc->irq_data.chip); } return ret; @@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action) static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { again: - chip_bus_lock(irq, desc); + chip_bus_lock(desc); raw_spin_lock_irq(&desc->lock); /* @@ -521,17 +522,17 @@ again: */ if (unlikely(desc->status & IRQ_INPROGRESS)) { raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); cpu_relax(); goto again; } if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } raw_spin_unlock_irq(&desc->lock); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } #ifdef CONFIG_SMP @@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) } raw_spin_lock_irq(&desc->lock); - cpumask_copy(mask, desc->affinity); + cpumask_copy(mask, desc->irq_data.affinity); raw_spin_unlock_irq(&desc->lock); set_cpus_allowed_ptr(current, mask); @@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!desc) return -EINVAL; - if (desc->chip == &no_irq_chip) + if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS; /* * Some drivers like serial.c use request_irq() heavily, @@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } if (!shared) { - irq_chip_set_defaults(desc->chip); + irq_chip_set_defaults(desc->irq_data.chip); init_waitqueue_head(&desc->wait_for_threads); @@ -779,7 +780,7 
@@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!(desc->status & IRQ_NOAUTOEN)) { desc->depth = 0; desc->status &= ~IRQ_DISABLED; - desc->chip->startup(irq); + desc->irq_data.chip->irq_startup(&desc->irq_data); } else /* Undo nested disables: */ desc->depth = 1; @@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Currently used only by UML, might disappear one day: */ #ifdef CONFIG_IRQ_RELEASE_METHOD - if (desc->chip->release) - desc->chip->release(irq, dev_id); + if (desc->irq_data.chip->release) + desc->irq_data.chip->release(irq, dev_id); #endif /* If this was the last handler, shut down the IRQ line: */ if (!desc->action) { desc->status |= IRQ_DISABLED; - if (desc->chip->shutdown) - desc->chip->shutdown(irq); + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); else - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); } #ifdef CONFIG_SMP @@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id) if (!desc) return; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); @@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, action->name = devname; action->dev_id = dev_id; - chip_bus_lock(irq, desc); + chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); - chip_bus_sync_unlock(irq, desc); + chip_bus_sync_unlock(desc); if (retval) kfree(action); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 24196228083..1d254194048 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -7,6 +7,7 @@ void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); + struct irq_chip *chip = desc->irq_data.chip; if (likely(!(desc->status & IRQ_MOVE_PENDING))) return; @@ -24,7 +25,7 @@ void move_masked_irq(int irq) if (unlikely(cpumask_empty(desc->pending_mask))) return; - if (!desc->chip->set_affinity) + if (!chip->irq_set_affinity) return; assert_raw_spin_locked(&desc->lock); @@ -43,8 +44,9 @@ void move_masked_irq(int irq) */ if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) - if (!desc->chip->set_affinity(irq, desc->pending_mask)) { - cpumask_copy(desc->affinity, desc->pending_mask); + if (!chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false)) { + cpumask_copy(desc->irq_data.affinity, desc->pending_mask); irq_set_thread_affinity(desc); } @@ -61,8 +63,8 @@ void move_native_irq(int irq) if (unlikely(desc->status & IRQ_DISABLED)) return; - desc->chip->mask(irq); + desc->irq_data.chip->irq_mask(&desc->irq_data); move_masked_irq(irq); - desc->chip->unmask(irq); + desc->irq_data.chip->irq_unmask(&desc->irq_data); } diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c deleted file mode 100644 index 65d3845665a..00000000000 --- a/kernel/irq/numa_migrate.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * NUMA irq-desc migration code - * - * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to - * the new "home node" of the IRQ. 
- */ - -#include <linux/irq.h> -#include <linux/slab.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/interrupt.h> -#include <linux/kernel_stat.h> - -#include "internals.h" - -static void init_copy_kstat_irqs(struct irq_desc *old_desc, - struct irq_desc *desc, - int node, int nr) -{ - init_kstat_irqs(desc, node, nr); - - if (desc->kstat_irqs != old_desc->kstat_irqs) - memcpy(desc->kstat_irqs, old_desc->kstat_irqs, - nr * sizeof(*desc->kstat_irqs)); -} - -static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) -{ - if (old_desc->kstat_irqs == desc->kstat_irqs) - return; - - kfree(old_desc->kstat_irqs); - old_desc->kstat_irqs = NULL; -} - -static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, - struct irq_desc *desc, int node) -{ - memcpy(desc, old_desc, sizeof(struct irq_desc)); - if (!alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); - return false; - } - raw_spin_lock_init(&desc->lock); - desc->node = node; - lockdep_set_class(&desc->lock, &irq_desc_lock_class); - init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); - init_copy_desc_masks(old_desc, desc); - arch_init_copy_chip_data(old_desc, desc, node); - return true; -} - -static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) -{ - free_kstat_irqs(old_desc, desc); - free_desc_masks(old_desc, desc); - arch_free_chip_data(old_desc, desc); -} - -static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, - int node) -{ - struct irq_desc *desc; - unsigned int irq; - unsigned long flags; - - irq = old_desc->irq; - - raw_spin_lock_irqsave(&sparse_irq_lock, flags); - - /* We have to check it to avoid races with another CPU */ - desc = irq_to_desc(irq); - - if (desc && old_desc != desc) - goto out_unlock; - - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc " - "for migration.\n", irq); - /* still use old one */ - desc = old_desc; - goto out_unlock; - } - if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) { - /* still use old one */ - kfree(desc); - desc = old_desc; - goto out_unlock; - } - - replace_irq_desc(irq, desc); - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - /* free the old one */ - free_one_irq_desc(old_desc, desc); - kfree(old_desc); - - return desc; - -out_unlock: - raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} - -struct irq_desc *move_irq_desc(struct irq_desc *desc, int node) -{ - /* those static or target node is -1, do not move them */ - if (desc->irq < NR_IRQS_LEGACY || node == -1) - return desc; - - if (desc->node != node) - desc = __real_move_irq_desc(desc, node); - - return desc; -} - diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 09a2ee540bd..01b1d3a8898 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); - const struct cpumask *mask = desc->affinity; + const struct cpumask *mask = desc->irq_data.affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) @@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, cpumask_var_t new_value; int err; - if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || + if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity || 
irq_balancing_disabled(irq)) return -EIO; @@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long) m->private); - seq_printf(m, "%d\n", desc->node); + seq_printf(m, "%d\n", desc->irq_data.node); return 0; } @@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) { char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) return; memset(name, 0, MAX_NAMELEN); @@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) &irq_spurious_proc_fops, (void *)(long)irq); } +void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || !desc->dir) + return; +#ifdef CONFIG_SMP + remove_proc_entry("smp_affinity", desc->dir); + remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("node", desc->dir); +#endif + remove_proc_entry("spurious", desc->dir); + + memset(name, 0, MAX_NAMELEN); + sprintf(name, "%u", irq); + remove_proc_entry(name, root_irq_dir); +} + #undef MAX_NAMELEN void unregister_handler_proc(unsigned int irq, struct irqaction *action) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 090c3763f3a..891115a929a 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) /* * Make sure the interrupt is enabled, before resending it: */ - desc->chip->enable(irq); + desc->irq_data.chip->irq_enable(&desc->irq_data); /* * We do not resend level type interrupts. Level type @@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; - if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 89fb90ae534..3089d3b9d5f 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -14,6 +14,8 @@ #include <linux/moduleparam.h> #include <linux/timer.h> +#include "internals.h" + static int irqfixup __read_mostly; #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) @@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc) * If we did actual work for the real IRQ line we must let the * IRQ controller clean up too */ - if (work && desc->chip && desc->chip->end) - desc->chip->end(irq); + if (work) + irq_end(irq, desc); raw_spin_unlock(&desc->lock); return ok; @@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, printk(KERN_EMERG "Disabling IRQ #%d\n", irq); desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; desc->depth++; - desc->chip->disable(irq); + desc->irq_data.chip->irq_disable(&desc->irq_data); mod_timer(&poll_spurious_irq_timer, jiffies + POLL_SPURIOUS_IRQ_INTERVAL); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f2852a51023..42ba65dff7d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } #endif + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + 
"turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + /* * Static locks do not have their class-keys yet - for them the key * is the lock object itself: @@ -774,7 +784,9 @@ out_unlock_set: raw_local_irq_restore(flags); if (!subclass || force) - lock->class_cache = class; + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - lock->class_cache = NULL; + int i; + + for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) + lock->class_cache[i] = NULL; + #ifdef CONFIG_LOCK_STAT lock->cpu = raw_smp_processor_id(); #endif @@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { - debug_locks_off(); - printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); - return 0; - } - if (lock->key == &__lockdep_no_validate__) check = 1; - if (!subclass) - class = lock->class_cache; + if (subclass < NR_LOCKDEP_CACHING_CLASSES) + class = lock->class_cache[subclass]; /* - * Not cached yet or subclass? + * Not cached? */ if (unlikely(!class)) { class = register_lock_class(lock, subclass, 0); @@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache; + struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) if (list_empty(head)) continue; list_for_each_entry_safe(class, next, head, hash_entry) { - if (unlikely(class == lock->class_cache)) { + int match = 0; + + for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) + match |= class == lock->class_cache[j]; + + if (unlikely(match)) { if (debug_locks_off_graph_unlock()) WARN_ON(1); goto out_restore; @@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks); * Careful: only use this function if you are sure that * the task cannot run in parallel! 
*/ -void __debug_show_held_locks(struct task_struct *task) +void debug_show_held_locks(struct task_struct *task) { if (unlikely(!debug_locks)) { printk("INFO: lockdep is turned off.\n"); @@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task) } lockdep_print_held_locks(task); } -EXPORT_SYMBOL_GPL(__debug_show_held_locks); - -void debug_show_held_locks(struct task_struct *task) -{ - __debug_show_held_locks(task); -} EXPORT_SYMBOL_GPL(debug_show_held_locks); void lockdep_sys_exit(void) diff --git a/kernel/pid.c b/kernel/pid.c index d55c6fb8d08..39b65b69584 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), rcu_read_lock_held() || lockdep_tasklist_lock_is_held()); if (first) @@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { + rcu_lockdep_assert(rcu_read_lock_held()); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008..2531017795f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress); * provides serialisation for access to the entire console * driver system. */ -static DECLARE_MUTEX(console_sem); +static DEFINE_SEMAPHORE(console_sem); struct console *console_drivers; EXPORT_SYMBOL_GPL(console_drivers); @@ -556,7 +556,7 @@ static void zap_locks(void) /* If a crash is occurring, make sure we can't deadlock */ spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ - init_MUTEX(&console_sem); + sema_init(&console_sem, 1); } #if defined(CONFIG_PRINTK_TIME) diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4d169835fb3..a23a57a976d 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void) EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); /** - * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? * * Check for bottom half being disabled, which covers both the * CONFIG_PROVE_RCU and not cases. Note that if someone uses * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) - * will show the situation. + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. */ @@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void) { if (!debug_lockdep_rcu_enabled()) return 1; - return in_softirq(); + return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 196ec02f8be..d806735342a 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +/* Forward declarations for rcutiny_plugin.h. 
*/ +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + #ifdef CONFIG_NO_HZ static long rcu_dynticks_nesting = 1; @@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_sched_qs(cpu); else if (!in_softirq()) rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); } /* @@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) *rcp->donetail = NULL; if (rcp->curtail == rcp->donetail) rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); rcp->donetail = &rcp->rcucblist; local_irq_restore(flags); @@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) { __rcu_process_callbacks(&rcu_sched_ctrlblk); __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head, } /* - * Post an RCU callback to be invoked after the end of an RCU grace + * Post an RCU callback to be invoked after the end of an RCU-sched grace * period. But since we have but one CPU, that would be after any * quiescent state. */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { __call_rcu(head, func, &rcu_sched_ctrlblk); } -EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_sched); /* * Post an RCU bottom-half callback to be invoked after any subsequent @@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu_bh); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - void rcu_barrier_bh(void) { struct rcu_synchronize rcu; @@ -289,5 +286,3 @@ void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); } - -#include "rcutiny_plugin.h" diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index d223a92bc74..6ceca4f745f 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -1,7 +1,7 @@ /* - * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition * Internal non-public definitions that provide either classic - * or preemptable semantics. + * or preemptible semantics. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,11 +17,587 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * - * Copyright IBM Corporation, 2009 + * Copyright (c) 2010 Linaro * * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> */ +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include <linux/delay.h> + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. 
This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is not such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. + */ +static int rcu_cpu_blocking_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. 
Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* + * If there is no GP, or if blocked readers are still blocking GP, + * then there is nothing more to do. + */ + if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, make RCU_SOFTIRQ process them. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + raise_softirq(RCU_SOFTIRQ); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. */ + rcu_preempt_ctrlblk.gpnum++; + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. + * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. 
+ * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_blocking_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + list_del(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; + INIT_LIST_HEAD(&t->rcu_node_entry); + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. + */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. 
If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + raise_softirq(RCU_SOFTIRQ); + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from __rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible -RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. 
+ */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier(). */ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely in the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section. Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave. + */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + local_irq_restore(flags); + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (rcu_preempted_readers_exp()) + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? 
+ */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible -RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2e2726d790b..9d8e8fb2515 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -120,7 +120,7 @@ struct rcu_torture { }; static LIST_HEAD(rcu_torture_freelist); -static struct rcu_torture *rcu_torture_current; +static struct rcu_torture __rcu *rcu_torture_current; static long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); @@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ static int fullstop = FULLSTOP_RMMOD; -DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ - /* of kthreads. */ +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); /* * Detect and respond to a system shutdown. 
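The TINY_PREEMPT_RCU hunks above keep every callback on one singly linked list that is carved into "done", "current grace period" and "later grace period" segments by the donetail, curtail and nexttail pointers: call_rcu() appends at *nexttail, rcu_preempt_cpu_qs() advances the segment boundaries when a grace period ends, and __rcu_process_callbacks() detaches and invokes the done segment. The stand-alone sketch below (user-space C; the cb/ctrlblk names and helper functions are illustrative stand-ins, not kernel code) models just that pointer discipline, assuming a single thread and no locking:

#include <stdio.h>

/* Illustrative stand-in for struct rcu_head. */
struct cb {
        struct cb *next;
        const char *name;
};

/* One list, three tail pointers marking the segment boundaries. */
struct ctrlblk {
        struct cb *list;        /* head of all queued callbacks */
        struct cb **donetail;   /* ->next of last callback ready to invoke */
        struct cb **curtail;    /* ->next of last callback waiting for the current GP */
        struct cb **nexttail;   /* ->next of last callback waiting for a later GP */
};

static void ctrlblk_init(struct ctrlblk *cp)
{
        cp->list = NULL;
        cp->donetail = &cp->list;
        cp->curtail = &cp->list;
        cp->nexttail = &cp->list;
}

/* Like call_rcu() above: append at *nexttail. */
static void enqueue(struct ctrlblk *cp, struct cb *new)
{
        new->next = NULL;
        *cp->nexttail = new;
        cp->nexttail = &new->next;
}

/*
 * Like the advance in rcu_preempt_cpu_qs(): the current-GP segment becomes
 * done, and everything queued since the grace period began becomes the new
 * current-GP segment.
 */
static void end_grace_period(struct ctrlblk *cp)
{
        cp->donetail = cp->curtail;
        cp->curtail = cp->nexttail;
}

/* Like __rcu_process_callbacks(): detach and invoke the done segment. */
static void invoke_done(struct ctrlblk *cp)
{
        struct cb *ready = cp->list;

        if (cp->donetail == &cp->list)
                return;                 /* nothing ready to invoke */
        cp->list = *cp->donetail;       /* remaining callbacks keep waiting */
        *cp->donetail = NULL;           /* terminate the detached segment */
        if (cp->curtail == cp->donetail)
                cp->curtail = &cp->list;
        if (cp->nexttail == cp->donetail)       /* what rcu_preempt_remove_callbacks() fixes up */
                cp->nexttail = &cp->list;
        cp->donetail = &cp->list;

        for (; ready; ready = ready->next)
                printf("invoking %s\n", ready->name);
}

int main(void)
{
        struct ctrlblk cp;
        struct cb a = { .name = "a" }, b = { .name = "b" };

        ctrlblk_init(&cp);
        enqueue(&cp, &a);
        end_grace_period(&cp);  /* "a" now waits for the grace period in progress */
        enqueue(&cp, &b);       /* "b" must wait for a later grace period */
        end_grace_period(&cp);  /* "a" is done, "b" becomes current */
        invoke_done(&cp);       /* prints "invoking a" */
        end_grace_period(&cp);  /* "b" is done */
        invoke_done(&cp);       /* prints "invoking b" */
        return 0;
}

Running the sketch prints "invoking a" and, only after a further grace-period end, "invoking b": a callback queued at *nexttail has to wait out a grace period that begins after it was queued, which is the invariant the three-tail layout exists to preserve.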
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp) mdelay(longdelay_ms); if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); if (!delay) schedule_timeout_interruptible(longdelay); + else + rcu_read_delay(rrsp); } static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) @@ -731,7 +739,8 @@ rcu_torture_writer(void *arg) continue; rp->rtort_pipe_count = 0; udelay(rcu_random(&rand) & 0x3ff); - old_rp = rcu_torture_current; + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); rp->rtort_mbtest = 1; rcu_assign_pointer(rcu_torture_current, rp); smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d5bc43976c5..ccdc04c4798 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -143,6 +143,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; +module_param(rcu_cpu_stall_suppress, int, 0644); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -int rcu_cpu_stall_panicking __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; static void record_gp_stall_check_time(struct rcu_state *rsp) { @@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); - /* OK, time to rat on our buddy... */ - + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", rsp->name); rcu_for_each_leaf_node(rsp, rnp) { @@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); @@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) long delta; struct rcu_node *rnp; - if (rcu_cpu_stall_panicking) + if (rcu_cpu_stall_suppress) return; - delta = jiffies - rsp->jiffies_stall; + delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall); rnp = rdp->mynode; - if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) { /* We haven't checked in, so go dump stack. 
*/ print_cpu_stall(rsp); @@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) { - rcu_cpu_stall_panicking = 1; + rcu_cpu_stall_suppress = 1; return NOTIFY_DONE; } +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + static struct notifier_block rcu_panic_block = { .notifier_call = rcu_panic, }; @@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { } +void rcu_cpu_stall_reset(void) +{ +} + static void __init check_cpu_stall_init(void) { } @@ -712,7 +745,7 @@ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { @@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) { int i; - struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ @@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; rdp->qlen = 0; raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } @@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) struct rcu_data *rdp; raw_spin_lock_irqsave(&rsp->onofflock, flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); if (rsp->orphan_cbs_list == NULL) { raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; @@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; rdp->qlen += rsp->orphan_qlen; + rdp->n_cbs_adopted += rsp->orphan_qlen; rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; @@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) unsigned long flags; unsigned long mask; int need_report = 0; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* Update count, and requeue any remaining callbacks. 
*/ rdp->qlen -= count; + rdp->n_cbs_invoked += count; if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; @@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) mask |= bit; } if (mask != 0) { @@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); - rdp = rsp->rda[smp_processor_id()]; + rdp = this_cpu_ptr(rsp->rda); rcu_process_gp_end(rsp, rdp); check_for_new_grace_period(rsp, rdp); @@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) { unsigned long flags; int i; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) { unsigned long flags; unsigned long mask; - struct rcu_data *rdp = rsp->rda[cpu]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ @@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp) +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) { static char *buf[] = { "rcu_node_level_0", "rcu_node_level_1", @@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp) } } + rsp->rda = rda; rnp = rsp->level[NUM_RCU_LVLS - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) rnp++; - rsp->rda[i]->mynode = rnp; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; rcu_boot_init_percpu_data(i, rsp); } } -/* - * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used - * nowhere else! Assigns leaf node pointers into each CPU's rcu_data - * structure. - */ -#define RCU_INIT_FLAVOR(rsp, rcu_data) \ -do { \ - int i; \ - \ - for_each_possible_cpu(i) { \ - (rsp)->rda[i] = &per_cpu(rcu_data, i); \ - } \ - rcu_init_one(rsp); \ -} while (0) - void __init rcu_init(void) { int cpu; rcu_bootup_announce(); - RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); - RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 14c040b18ed..91d4170c5c1 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -202,6 +202,9 @@ struct rcu_data { long qlen; /* # of queued callbacks */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */ unsigned long n_force_qs_snap; /* did other CPU force QS recently? 
*/ long blimit; /* Upper limit on a processed batch */ @@ -254,19 +257,23 @@ struct rcu_data { #define RCU_STALL_DELAY_DELTA 0 #endif -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ /* before ratting on them. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE +#define RCU_CPU_STALL_SUPPRESS_INIT 0 +#else +#define RCU_CPU_STALL_SUPPRESS_INIT 1 +#endif -#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) -#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* * RCU global state, including node hierarchy. This hierarchy is @@ -283,7 +290,7 @@ struct rcu_state { struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ - struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, #ifdef CONFIG_RCU_CPU_STALL_DETECTOR static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e4f420245d..71a4147473f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU-based detection of stalled CPUs is disabled.\n"); #endif -#ifndef CONFIG_RCU_CPU_STALL_VERBOSE +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); #endif #if NUM_RCU_LVL_4 != 0 @@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. 
*/ - rdp = rcu_preempt_state.rda[cpu]; + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu) */ void __rcu_read_lock(void) { - ACCESS_ONCE(current->rcu_read_lock_nesting)++; + current->rcu_read_lock_nesting++; barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); @@ -344,7 +344,9 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ - if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); #ifdef CONFIG_PROVE_LOCKING @@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp) } } +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu); * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU - * read-side critical sections have completed. RCU read-side critical - * sections are delimited by rcu_read_lock() and rcu_read_unlock(), - * and may be nested. + * read-side critical sections have completed. Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. */ void synchronize_rcu(void) { @@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void) */ static void __init __rcu_init_preempt(void) { - RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); } /* @@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp) { } +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* @@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void) } /* - * In classic RCU, call_rcu() is just call_rcu_sched(). - */ -void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) -{ - call_rcu_sched(head, func); -} -EXPORT_SYMBOL_GPL(call_rcu); - -/* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptable RCU does not exist, map to rcu-sched. 
*/ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 36c95b45738..d15430b9d12 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } #define PRINT_RCU_DATA(name, func, m) \ @@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); #endif /* #ifdef CONFIG_NO_HZ */ seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); + seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); } static int show_rcudata_csv(struct seq_file *m, void *unused) @@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ - seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); + seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n"); #ifdef CONFIG_TREE_PREEMPT_RCU seq_puts(m, "\"rcu_preempt:\"\n"); PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); @@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) struct rcu_data *rdp; for_each_possible_cpu(cpu) { - rdp = rsp->rda[cpu]; + rdp = per_cpu_ptr(rsp->rda, cpu); if (rdp->beenonline) print_one_rcu_pending(m, rdp); } diff --git a/kernel/sched.c b/kernel/sched.c index c0d2067f3e0..d42992bccdf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -426,9 +426,7 @@ struct root_domain { */ cpumask_var_t rto_mask; atomic_t rto_count; -#ifdef CONFIG_SMP struct cpupri cpupri; -#endif }; /* @@ -437,7 +435,7 @@ struct root_domain { */ static struct root_domain def_root_domain; -#endif +#endif /* CONFIG_SMP */ /* * This is the main, per-CPU runqueue data structure. 
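The stall-warning rework above leans on wrap-safe unsigned arithmetic: rcu_cpu_stall_reset() parks ->jiffies_stall half the counter space into the future, and synchronize_rcu_expedited() compares counter snapshots with ULONG_CMP_LT(). The two macro definitions below are the ones being relocated within rcutree.h by this patch; the demo program around them is an illustrative user-space sketch, not kernel code:

#include <limits.h>
#include <stdio.h>

/* Wrap-safe ordering on unsigned counters (as moved within rcutree.h). */
#define ULONG_CMP_GE(a, b)      (ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)      (ULONG_MAX / 2 < (a) - (b))

int main(void)
{
        unsigned long now = ULONG_MAX - 10;             /* counter about to wrap */
        unsigned long later = now + 20;                 /* wraps to a small value */
        unsigned long deadline = now + ULONG_MAX / 2;   /* "far future" timeout */

        /* Despite the wrap, "later" still orders after "now". */
        printf("later >= now:     %d\n", ULONG_CMP_GE(later, now));     /* 1 */
        printf("now < later:      %d\n", ULONG_CMP_LT(now, later));     /* 1 */

        /*
         * A timeout pushed ULONG_MAX / 2 ahead stays in the future for any
         * realistic advance of the counter, which is how rcu_cpu_stall_reset()
         * suppresses stall warnings for the current set of grace periods.
         */
        printf("deadline reached: %d\n", ULONG_CMP_GE(now + 1000, deadline)); /* 0 */
        return 0;
}

Because the test is done on the unsigned difference, it keeps giving the right answer across a counter wrap as long as the two values are less than half the counter space apart, which is exactly the property the ULONG_MAX / 2 offset exploits.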
@@ -488,11 +486,12 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr, *idle; + struct task_struct *curr, *idle, *stop; unsigned long next_balance; struct mm_struct *prev_mm; u64 clock; + u64 clock_task; atomic_t nr_iowait; @@ -520,6 +519,10 @@ struct rq { u64 avg_idle; #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + u64 prev_irq_time; +#endif + /* calc_load related fields */ unsigned long calc_load_update; long calc_load_active; @@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p) #endif /* CONFIG_CGROUP_SCHED */ +static u64 irq_time_cpu(int cpu); +static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); + inline void update_rq_clock(struct rq *rq) { - if (!rq->skip_clock_update) - rq->clock = sched_clock_cpu(cpu_of(rq)); + if (!rq->skip_clock_update) { + int cpu = cpu_of(rq); + u64 irq_time; + + rq->clock = sched_clock_cpu(cpu); + irq_time = irq_time_cpu(cpu); + if (rq->clock - irq_time > rq->clock_task) + rq->clock_task = rq->clock - irq_time; + + sched_irq_time_avg_update(rq, irq_time); + } } /* @@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; - char *cmp = buf; + char *cmp; int neg = 0; int i; @@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, return -EFAULT; buf[cnt] = 0; + cmp = strstrip(buf); if (strncmp(buf, "NO_", 3) == 0) { neg = 1; @@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, } for (i = 0; sched_feat_names[i]; i++) { - int len = strlen(sched_feat_names[i]); - - if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { if (neg) sysctl_sched_features &= ~(1UL << i); else @@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { - if (task_has_rt_policy(p)) { - p->se.load.weight = 0; - p->se.load.inv_weight = WMULT_CONST; - return; - } - /* * SCHED_IDLE tasks get minimal weight: */ @@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) dec_nr_running(rq); } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. + * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value (or semi updated value on 32 bit) with a side effect of + * accounting a slice of irq time to wrong task when irq is in progress + * while we read rq->clock. That is a worthy compromise in place of having + * locks on each irq in account_system_time. 
+ */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +static u64 irq_time_cpu(int cpu) +{ + if (!sched_clock_irqtime) + return 0; + + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} + +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + int cpu; + u64 now, delta; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + now = sched_clock_cpu(cpu); + delta = now - per_cpu(irq_start_time, cpu); + per_cpu(irq_start_time, cpu) = now; + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to ksoftirqd thread + * in that case, so as not to confuse scheduler with a special task + * that do not consume any time, but still wants to run. + */ + if (hardirq_count()) + per_cpu(cpu_hardirq_time, cpu) += delta; + else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) + per_cpu(cpu_softirq_time, cpu) += delta; + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) +{ + if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { + u64 delta_irq = curr_irq_time - rq->prev_irq_time; + rq->prev_irq_time = curr_irq_time; + sched_rt_avg_update(rq, delta_irq); + } +} + +#else + +static u64 irq_time_cpu(int cpu) +{ + return 0; +} + +static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } + +#endif + #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "sched_stoptask.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, its something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. + */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. 
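Nothing turns this accounting on by itself: sched_clock_irqtime stays 0 until an architecture with a sufficiently fine-grained and reliable sched_clock() calls enable_sched_clock_irqtime(). A hypothetical arch-side opt-in sketch, where hypo_sched_clock_is_stable() stands in for whatever stability check the platform actually provides:

#include <linux/init.h>
#include <linux/sched.h>
#include <linux/types.h>

extern bool hypo_sched_clock_is_stable(void);	/* assumed arch helper, not a real API */

static int __init hypo_arch_irqtime_init(void)
{
	/* only worthwhile when sched_clock() is fine-grained and reliable */
	if (hypo_sched_clock_is_stable())
		enable_sched_clock_irqtime();
	return 0;
}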
+ */ + old_stop->sched_class = &rt_sched_class; + } +} + /* * __normal_prio - return the priority that is based on the static prio */ @@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) if (p->sched_class != &fair_sched_class) return 0; + if (unlikely(p->policy == SCHED_IDLE)) + return 0; + /* * Buddy candidates are cache hot: */ @@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); - if (likely(!mm)) { + if (!mm) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); - if (likely(!prev->mm)) { + if (!prev->mm) { prev->active_mm = NULL; rq->prev_mm = oldmm; } @@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) if (task_current(rq, p)) { update_rq_clock(rq); - ns = rq->clock - p->se.exec_start; + ns = rq->clock_task - p->se.exec_start; if ((s64)ns < 0) ns = 0; } @@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, tmp = cputime_to_cputime64(cputime); if (hardirq_count() - hardirq_offset) cpustat->irq = cputime64_add(cpustat->irq, tmp); - else if (softirq_count()) + else if (in_serving_softirq()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); else cpustat->system = cputime64_add(cpustat->system, tmp); @@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq) return p; } - class = sched_class_highest; - for ( ; ; ) { + for_each_class(class) { p = class->pick_next_task(rq); if (p) return p; - /* - * Will never be NULL as the idle class always - * returns a non-NULL p: - */ - class = class->next; } + + BUG(); /* the idle class will always have a runnable task */ } /* @@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; @@ -4645,7 +4772,7 @@ recheck: } if (user) { - retval = security_task_setscheduler(p, policy, param); + retval = security_task_setscheduler(p); if (retval) return retval; } @@ -4661,6 +4788,15 @@ recheck: */ rq = __task_rq_lock(p); + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EINVAL; + } + #ifdef CONFIG_RT_GROUP_SCHED if (user) { /* @@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) goto out_unlock; - retval = security_task_setscheduler(p, 0, NULL); + retval = security_task_setscheduler(p); if (retval) goto out_unlock; cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, in_mask, cpus_allowed); - again: +again: retval = set_cpus_allowed_ptr(p, new_mask); if (!retval) { @@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->se.exec_start = sched_clock(); cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); + /* + * We're having a chicken and egg problem, even though we are + * holding rq->lock, the cpu isn't yet set to this cpu so the + * lockdep check in task_group() will fail. + * + * Similar case to sched_fork(). / Alternatively we could + * use task_rq_lock() here and obtain the other rq->lock. 
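With sched_class_highest retargeted to the stop class, pick_next_task() above becomes a plain walk of the class list, and the trailing BUG() is unreachable because the idle class always hands back a task. A condensed restatement of that walk (the stop class links to rt via its .next pointer later in this patch; rt, fair and idle keep their usual chaining):

static struct task_struct *pick_next_task_sketch(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/* visits stop, rt, fair, idle in priority order */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG();	/* idle_sched_class always has a runnable task */
}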
+ * + * Silence PROVE_RCU + */ + rcu_read_lock(); __set_task_cpu(idle, cpu); + rcu_read_unlock(); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) @@ -6514,6 +6662,7 @@ struct s_data { cpumask_var_t nodemask; cpumask_var_t this_sibling_map; cpumask_var_t this_core_map; + cpumask_var_t this_book_map; cpumask_var_t send_covered; cpumask_var_t tmpmask; struct sched_group **sched_group_nodes; @@ -6525,6 +6674,7 @@ enum s_alloc { sa_rootdomain, sa_tmpmask, sa_send_covered, + sa_this_book_map, sa_this_core_map, sa_this_sibling_map, sa_nodemask, @@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, #ifdef CONFIG_SCHED_MC static DEFINE_PER_CPU(struct static_sched_domain, core_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); -#endif /* CONFIG_SCHED_MC */ -#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) static int cpu_to_core_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; - +#ifdef CONFIG_SCHED_SMT cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); group = cpumask_first(mask); +#else + group = cpu; +#endif if (sg) *sg = &per_cpu(sched_group_core, group).sg; return group; } -#elif defined(CONFIG_SCHED_MC) +#endif /* CONFIG_SCHED_MC */ + +/* + * book sched-domains: + */ +#ifdef CONFIG_SCHED_BOOK +static DEFINE_PER_CPU(struct static_sched_domain, book_domains); +static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); + static int -cpu_to_core_group(int cpu, const struct cpumask *cpu_map, - struct sched_group **sg, struct cpumask *unused) +cpu_to_book_group(int cpu, const struct cpumask *cpu_map, + struct sched_group **sg, struct cpumask *mask) { + int group = cpu; +#ifdef CONFIG_SCHED_MC + cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_SMT) + cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); + group = cpumask_first(mask); +#endif if (sg) - *sg = &per_cpu(sched_group_core, cpu).sg; - return cpu; + *sg = &per_cpu(sched_group_book, group).sg; + return group; } -#endif +#endif /* CONFIG_SCHED_BOOK */ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); @@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, struct sched_group **sg, struct cpumask *mask) { int group; -#ifdef CONFIG_SCHED_MC +#ifdef CONFIG_SCHED_BOOK + cpumask_and(mask, cpu_book_mask(cpu), cpu_map); + group = cpumask_first(mask); +#elif defined(CONFIG_SCHED_MC) cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) @@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU) #ifdef CONFIG_SCHED_MC SD_INIT_FUNC(MC) #endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif static int default_relax_domain_level = -1; @@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, free_cpumask_var(d->tmpmask); /* fall through */ case sa_send_covered: free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_book_map: + free_cpumask_var(d->this_book_map); /* fall through */ case sa_this_core_map: free_cpumask_var(d->this_core_map); /* fall through */ case sa_this_sibling_map: @@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_nodemask; if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) return sa_this_sibling_map; - if (!alloc_cpumask_var(&d->send_covered, 
GFP_KERNEL)) + if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) return sa_this_core_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_book_map; if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) return sa_send_covered; d->rd = alloc_rootdomain(); @@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_book_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_BOOK + sd = &per_cpu(book_domains, i).sd; + SD_INIT(sd, BOOK); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + static struct sched_domain *__build_mc_sched_domain(struct s_data *d, const struct cpumask *cpu_map, struct sched_domain_attr *attr, struct sched_domain *parent, int i) @@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif +#ifdef CONFIG_SCHED_BOOK + case SD_LV_BOOK: /* set up book groups */ + cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); + if (cpu == cpumask_first(d->this_book_map)) + init_sched_build_groups(d->this_book_map, cpu_map, + &cpu_to_book_group, + d->send_covered, d->tmpmask); + break; +#endif case SD_LV_CPU: /* set up physical groups */ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); if (!cpumask_empty(d->nodemask)) @@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); + sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); build_sched_groups(&d, SD_LV_MC, cpu_map, i); } @@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, init_sched_groups_power(i, sd); } #endif +#ifdef CONFIG_SCHED_BOOK + for_each_cpu(i, cpu_map) { + sd = &per_cpu(book_domains, i).sd; + init_sched_groups_power(i, sd); + } +#endif for_each_cpu(i, cpu_map) { sd = &per_cpu(phys_domains, i).sd; @@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) sd = &per_cpu(core_domains, i).sd; +#elif defined(CONFIG_SCHED_BOOK) + sd = &per_cpu(book_domains, i).sd; #else sd = &per_cpu(phys_domains, i).sd; #endif @@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(cfs_rq); - err: +err: return 0; } @@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) return 1; - err_free_rq: +err_free_rq: kfree(rt_rq); - err: +err: return 0; } @@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg, raw_spin_unlock(&rt_rq->rt_runtime_lock); } raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); - unlock: +unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index db3f674ca49..933f3d1b62e 100644 --- 
a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -25,7 +25,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_min_granularity = 750000ULL; unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; @@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + u64 now = rq_of(cfs_rq)->clock_task; unsigned long delta_exec; if (unlikely(!curr)) @@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) /* * We are starting a new run period: */ - se->exec_start = rq_of(cfs_rq)->clock; + se->exec_start = rq_of(cfs_rq)->clock_task; } /************************************************** @@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p, set_task_cpu(p, this_cpu); activate_task(this_rq, p, 0); check_preempt_curr(this_rq, p, 0); + + /* re-arm NEWIDLE balancing when moving tasks */ + src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost; + this_rq->idle_stamp = 0; } /* @@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock, sd); + tsk_cache_hot = task_hot(p, rq->clock_task, sd); if (!tsk_cache_hot || sd->nr_balance_failed > sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS @@ -2030,12 +2034,14 @@ struct sd_lb_stats { unsigned long this_load; unsigned long this_load_per_task; unsigned long this_nr_running; + unsigned long this_has_capacity; /* Statistics of the busiest group */ unsigned long max_load; unsigned long busiest_load_per_task; unsigned long busiest_nr_running; unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; int group_imb; /* Is there imbalance in this sd */ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -2058,6 +2064,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long group_capacity; int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? 
*/ }; /** @@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu) u64 total, available; total = sched_avg_period() + (rq->clock - rq->age_stamp); - available = total - rq->rt_avg; + + if (unlikely(total < rq->rt_avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - rq->rt_avg; + } if (unlikely((s64)total < SCHED_LOAD_SCALE)) total = SCHED_LOAD_SCALE; @@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) { - unsigned long load, max_cpu_load, min_cpu_load; + unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; int i; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; @@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; + max_nr_running = 0; for_each_cpu_and(i, sched_group_cpus(group), cpus) { struct rq *rq = cpu_rq(i); @@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - if (load > max_cpu_load) + if (load > max_cpu_load) { max_cpu_load = load; + max_nr_running = rq->nr_running; + } if (min_cpu_load > load) min_cpu_load = load; } @@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (sgs->sum_nr_running) avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = - DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; } /** @@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, /* * In case the child domain prefers tasks go to siblings * first, lower the sg capacity to one so that we'll try - * and move all the excess tasks away. + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task outweighs the tasks on the system). 
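The new guard in scale_rt_power() above matters because total and rt_avg are unsigned 64-bit values: if rt_avg ever exceeds the sampling window, an unchecked total - rt_avg would wrap to a huge number and inflate the CPU's reported power instead of collapsing it. A minimal model of the clamped computation:

#include <linux/types.h>

/* time left for fair tasks after RT (and irq) load, clamped so it cannot wrap */
static u64 model_rt_scaled_available(u64 total, u64 rt_avg)
{
	if (rt_avg >= total)
		return 0;	/* window fully consumed */
	return total - rt_avg;
}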
*/ - if (prefer_sibling) + if (prefer_sibling && !local_group && sds->this_has_capacity) sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { @@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->this = sg; sds->this_nr_running = sgs.sum_nr_running; sds->this_load_per_task = sgs.sum_weighted_load; + sds->this_has_capacity = sgs.group_has_capacity; } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { sds->max_load = sgs.avg_load; sds->busiest = sg; sds->busiest_nr_running = sgs.sum_nr_running; sds->busiest_group_capacity = sgs.group_capacity; sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; sds->group_imb = sgs.group_imb; } @@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, return fix_small_imbalance(sds, this_cpu, imbalance); } + /******* find_busiest_group() helpers end here *********************/ /** @@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * 4) This group is more busy than the avg busieness at this * sched_domain. * 5) The imbalance is within the specified limit. + * + * Note: when doing newidle balance, if the local group has excess + * capacity (i.e. nr_running < group_capacity) and the busiest group + * does not have any capacity, we force a load balance to pull tasks + * to the local group. In this case, we skip past checks 3, 4 and 5. */ if (!(*balance)) goto ret; @@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + if (sds.this_load >= sds.max_load) goto out_balanced; @@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; +force_balance: /* Looks like there is an imbalance. Compute it */ calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; @@ -3031,7 +3068,14 @@ redo: if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); - sd->nr_balance_failed++; + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), this_cpu)) { @@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq) interval = msecs_to_jiffies(sd->balance_interval); if (time_after(next_balance, sd->last_balance + interval)) next_balance = sd->last_balance + interval; - if (pulled_task) { - this_rq->idle_stamp = 0; + if (pulled_task) break; - } } raw_spin_lock(&this_rq->lock); @@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p) update_rq_clock(rq); - if (unlikely(task_cpu(p) != this_cpu)) + if (unlikely(task_cpu(p) != this_cpu)) { + rcu_read_lock(); __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + } update_curr(cfs_rq); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 83c66e8ad3e..185f920ec1a 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1) * release the lock. Decreases scheduling overhead. 
*/ SCHED_FEAT(OWNER_SPIN, 1) + +/* + * Decrement CPU power based on irq activity + */ +SCHED_FEAT(NONIRQ_POWER, 1) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d10c80ebb67..bea7d79f7e9 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq) if (!task_has_rt_policy(curr)) return; - delta_exec = rq->clock - curr->se.exec_start; + delta_exec = rq->clock_task - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; @@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq->clock; + curr->se.exec_start = rq->clock_task; cpuacct_charge(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * runqueue. Otherwise simply start this RT task * on its current runqueue. * - * We want to avoid overloading runqueues. Even if - * the RT task is of higher priority than the current RT task. - * RT tasks behave differently than other tasks. If - * one gets preempted, we try to push it off to another queue. - * So trying to keep a preempting RT task on the same - * cache hot CPU will force the running RT task to - * a cold CPU. So we waste all the cache for the lower - * RT task in hopes of saving some of a RT task - * that is just being woken and probably will have - * cold cache anyway. + * We want to avoid overloading runqueues. If the woken + * task is a higher priority, then it will stay on this CPU + * and the lower prio task should be moved to another CPU. + * Even though this will probably make the lower prio task + * lose its cache, we do not want to bounce a higher task + * around just because it gave up its CPU, perhaps for a + * lock? + * + * For equal prio tasks, we just let the scheduler sort it out. 
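The placement rule applied just below reads more easily as a predicate: the waking RT task p is only steered toward find_lowest_rq() when the task already running is itself RT and is either pinned to this CPU or of higher priority (lower prio value), and p itself is allowed to run elsewhere. Condensed from the condition that follows:

static bool rt_wakeup_try_other_cpu(struct task_struct *curr,
				    struct task_struct *p)
{
	return rt_task(curr) &&
	       (curr->rt.nr_cpus_allowed < 2 || curr->prio < p->prio) &&
	       p->rt.nr_cpus_allowed > 1;
}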
*/ if (unlikely(rt_task(rq->curr)) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { int cpu = find_lowest_rq(p); @@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } while (rt_rq); p = rt_task_of(rt_se); - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; return p; } @@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) for_each_leaf_rt_rq(rt_rq, rq) { array = &rt_rq->active; idx = sched_find_first_bit(array->bitmap); - next_idx: +next_idx: if (idx >= MAX_RT_PRIO) continue; if (next && next->prio < idx) @@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq) if (!next_task) return 0; - retry: +retry: if (unlikely(next_task == rq->curr)) { WARN_ON(1); return 0; @@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq) * but possible) */ } - skip: +skip: double_unlock_balance(this_rq, src_rq); } @@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1) + p->rt.nr_cpus_allowed > 1 && + rt_task(rq->curr) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio < p->prio)) push_rt_tasks(rq); } @@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq) { struct task_struct *p = rq->curr; - p->se.exec_start = rq->clock; + p->se.exec_start = rq->clock_task; /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c new file mode 100644 index 00000000000..45bddc0c104 --- /dev/null +++ b/kernel/sched_stoptask.c @@ -0,0 +1,108 @@ +/* + * stop-task scheduling class. + * + * The stop task is the highest priority task in the system, it preempts + * everything and will be preempted by nothing. + * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct rq *rq, struct task_struct *p, + int sd_flag, int flags) +{ + return task_cpu(p); /* stop tasks as never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ + resched_task(rq->curr); /* we preempt everything */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + if (stop && stop->state == TASK_RUNNING) + return stop; + + return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void yield_task_stop(struct rq *rq) +{ + BUG(); /* the stop task should never yield, its pointless. */ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p, + int running) +{ + BUG(); /* its impossible to change to this class */ +} + +static void prio_changed_stop(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + BUG(); /* how!?, what priority? 
*/ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +static const struct sched_class stop_sched_class = { + .next = &rt_sched_class, + + .enqueue_task = enqueue_task_stop, + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + + .check_preempt_curr = check_preempt_curr_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_stop, +#endif + + .set_curr_task = set_curr_task_stop, + .task_tick = task_tick_stop, + + .get_rr_interval = get_rr_interval_stop, + + .prio_changed = prio_changed_stop, + .switched_to = switched_to_stop, + + /* no .task_new for stop tasks */ +}; diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73..fc978889b19 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -77,11 +77,21 @@ void wakeup_softirqd(void) } /* + * preempt_count and SOFTIRQ_OFFSET usage: + * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving + * softirq processing. + * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable. + * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ + +/* * This one is for softirq.c-internal use, * where hardirqs are disabled legitimately: */ #ifdef CONFIG_TRACE_IRQFLAGS -static void __local_bh_disable(unsigned long ip) +static void __local_bh_disable(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip) * We must manually increment preempt_count here and manually * call the trace_preempt_off later. 
*/ - preempt_count() += SOFTIRQ_OFFSET; + preempt_count() += cnt; /* * Were softirqs turned off above: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == cnt) trace_softirqs_off(ip); raw_local_irq_restore(flags); - if (preempt_count() == SOFTIRQ_OFFSET) + if (preempt_count() == cnt) trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } #else /* !CONFIG_TRACE_IRQFLAGS */ -static inline void __local_bh_disable(unsigned long ip) +static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) { - add_preempt_count(SOFTIRQ_OFFSET); + add_preempt_count(cnt); barrier(); } #endif /* CONFIG_TRACE_IRQFLAGS */ void local_bh_disable(void) { - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(local_bh_disable); +static void __local_bh_enable(unsigned int cnt) +{ + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == cnt) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(cnt); +} + /* * Special-case - softirqs can safely be enabled in * cond_resched_softirq(), or by __do_softirq(), @@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable); */ void _local_bh_enable(void) { - WARN_ON_ONCE(in_irq()); - WARN_ON_ONCE(!irqs_disabled()); - - if (softirq_count() == SOFTIRQ_OFFSET) - trace_softirqs_on((unsigned long)__builtin_return_address(0)); - sub_preempt_count(SOFTIRQ_OFFSET); + __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); } EXPORT_SYMBOL(_local_bh_enable); @@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip) /* * Are softirqs going to be turned on now: */ - if (softirq_count() == SOFTIRQ_OFFSET) + if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) trace_softirqs_on(ip); /* * Keep preemption disabled until we are done with * softirq processing: */ - sub_preempt_count(SOFTIRQ_OFFSET - 1); + sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) do_softirq(); @@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void) pending = local_softirq_pending(); account_system_vtime(current); - __local_bh_disable((unsigned long)__builtin_return_address(0)); + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); lockdep_softirq_enter(); cpu = smp_processor_id(); @@ -245,7 +262,7 @@ restart: lockdep_softirq_exit(); account_system_vtime(current); - _local_bh_enable(); + __local_bh_enable(SOFTIRQ_OFFSET); } #ifndef __ARCH_HAS_DO_SOFTIRQ @@ -279,10 +296,16 @@ void irq_enter(void) rcu_irq_enter(); if (idle_cpu(cpu) && !in_interrupt()) { - __irq_enter(); + /* + * Prevent raise_softirq from needlessly waking up ksoftirqd + * here, as softirq will be serviced on return from interrupt. 
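Because local_bh_disable() now adds SOFTIRQ_DISABLE_OFFSET (two softirq units) while __do_softirq() adds a single SOFTIRQ_OFFSET, the softirq bits of preempt_count can tell "bottom halves disabled" apart from "currently servicing a softirq". The predicates this enables reduce to the following sketch, assuming the hardirq.h helpers that accompany this series:

/* any softirq activity (disabled or serving)	*/
#define sketch_in_softirq()		(softirq_count())
/* actually inside softirq processing		*/
#define sketch_in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)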
+ */ + local_bh_disable(); tick_check_idle(cpu); - } else - __irq_enter(); + _local_bh_enable(); + } + + __irq_enter(); } #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED @@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu) { set_current_state(TASK_INTERRUPTIBLE); + current->flags |= PF_KSOFTIRQD; while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { @@ -886,17 +910,14 @@ int __init __weak early_irq_init(void) return 0; } +#ifdef CONFIG_GENERIC_HARDIRQS int __init __weak arch_probe_nr_irqs(void) { - return 0; + return NR_IRQS_LEGACY; } int __init __weak arch_early_irq_init(void) { return 0; } - -int __weak arch_init_chip_data(struct irq_desc *desc, int node) -{ - return 0; -} +#endif diff --git a/kernel/srcu.c b/kernel/srcu.c index 2980da3fd50..c71e0750053 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) int __init_srcu_struct(struct srcu_struct *sp, const char *name, struct lock_class_key *key) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(__init_srcu_struct); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 4372ccb2512..090c28812ce 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -287,11 +287,12 @@ repeat: goto repeat; } +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; unsigned int cpu = (unsigned long)hcpu; struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct task_struct *p; @@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, cpu); if (IS_ERR(p)) return NOTIFY_BAD; - sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); get_task_struct(p); + kthread_bind(p, cpu); + sched_set_stop_task(cpu, p); stopper->thread = p; break; case CPU_ONLINE: - kthread_bind(stopper->thread, cpu); /* strictly unnecessary, as first user will wake it */ wake_up_process(stopper->thread); /* mark enabled */ @@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, { struct cpu_stop_work *work; + sched_set_stop_task(cpu, NULL); /* kill the stopper */ kthread_stop(stopper->thread); /* drain remaining works */ diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c63116863a8..d2321891538 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -149,10 +149,18 @@ static void ntp_update_offset(long offset) time_reftime = get_seconds(); offset64 = offset; - freq_adj = (offset64 * secs) << - (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + freq_adj = ntp_update_offset_fll(offset64, secs); - freq_adj += ntp_update_offset_fll(offset64, secs); + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. 
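The effect of the clamp that follows is easier to see in isolation: the PLL correction scales with the interval since the last sample, so a long gap (an intermittent network link, for instance) must not be allowed to multiply the adjustment and destabilize the loop. A small model of the capped interval, using the SHIFT_PLL constant from timex.h:

/* cap the PLL update interval; the limit grows with time_constant */
static long model_clamp_pll_interval(long secs, int time_constant)
{
	long limit = 1L << (SHIFT_PLL + 1 + time_constant);

	return secs > limit ? limit : secs;
}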
+ */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65fb077ea79..ebd80d50c47 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f54115..001bcd2ccf4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a..9021f8c0c0c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ef49e9370b2..76b05980225 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -262,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -888,12 +916,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags, unsigned long addr, int depth) { int cpu = iter->cpu; + int *depth_irq; struct fgraph_data *data = iter->private; - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - if (flags & TRACE_GRAPH_PRINT_IRQS) + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. 
+ */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) return 0; + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + /* * We are inside the irq code */ @@ -926,12 +962,20 @@ static int check_irq_return(struct trace_iterator *iter, u32 flags, int depth) { int cpu = iter->cpu; + int *depth_irq; struct fgraph_data *data = iter->private; - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - if (flags & TRACE_GRAPH_PRINT_IRQS) + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) return 0; + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + /* * We are not inside the irq code. */ @@ -1163,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1226,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1258,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1299,6 +1354,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2..5cf8c602b88 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. 
*/ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return 0; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) + if (!func_prolog_dec(tr, &data, &flags)) return; - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return 
TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - -static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81..7319559ed59 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,48 +31,98 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER + /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. 
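Both latency tracers can now pass bare GRAPH_TRACER_FLAGS to the graph output routines because the exported wrappers choose the time field themselves: duration in latency format, absolute timestamps otherwise. Condensed from print_graph_function_flags() and print_graph_headers_flags() above:

static u32 graph_output_flags(u32 flags)
{
	if (trace_flags & TRACE_ITER_LATENCY_FMT)
		return flags | TRACE_GRAPH_PRINT_DURATION;

	return flags | TRACE_GRAPH_PRINT_ABS_TIME;
}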
*/ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); + *pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: preempt_enable_notrace(); } @@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the 
TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. 
*/ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -320,7 +516,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; @@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index dc8e16824b5..bafba687a6d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -196,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = { }; /* Callback function for perf event subsystem */ -void watchdog_overflow_callback(struct perf_event *event, int nmi, +static void watchdog_overflow_callback(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { |
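Switching the wakeup tracer between function and graph output (the new display-graph tracer option) amounts to a stop/reset/restart cycle: unregister whichever callbacks are active, drop stale data, then register the other set. A condensed sketch of wakeup_set_flag() and start_func_tracer() above:

static int model_toggle_graph_mode(int set)
{
	stop_func_tracer(!set);		/* unregister the current callbacks */
	wakeup_reset(wakeup_trace);	/* discard data from the old mode   */
	tracing_max_latency = 0;	/* restart max-latency tracking     */
	return start_func_tracer(set);	/* function ops or graph callbacks  */
}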