diff options
39 files changed, 501 insertions, 377 deletions
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4..5a621c6d22a 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8fe05ad932e..7a561eb731e 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 8bce5ed031e..1fbd94c4445 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -177,6 +177,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d..66cc75657e2 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b704740f16..782c3a357f2 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include <linux/irq.h> #include <linux/ratelimit.h> #include <linux/acpi.h> +#include <linux/sched.h> #include <asm/delay.h> #include <asm/intrinsics.h> @@ -496,6 +497,7 @@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf..b279e142c63 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include <linux/sched.h> + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b299..fc10b39893d 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. - * Rescheduling is processed at the exit of interrupt - * operation. * * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21cc8d..76923eeb58b 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5..cedac463374 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061f0ea..7d93e6fbfa5 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -309,6 +309,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b34..2608752898c 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577..b18b04e4857 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992e..d667875be56 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include <linux/delay.h> #include <linux/smp.h> #include <linux/kernel_stat.h> +#include <linux/sched.h> #include <asm/mmu_context.h> #include <asm/io.h> @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128..38e7f6bd792 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include <linux/interrupt.h> #include <linux/smp.h> #include <linux/kernel_stat.h> +#include <linux/sched.h> #include <asm/mmu_context.h> #include <asm/io.h> @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a219..83fb2791223 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef..828305f19cf 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14be4b..9f9c204bef6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -116,7 +116,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db83f9..63c7d9ff220 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. */ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b4511..6207561ea34 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include <linux/module.h> #include <linux/cpu.h> #include <linux/interrupt.h> +#include <linux/sched.h> #include <asm/atomic.h> #include <asm/processor.h> #include <asm/system.h> @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 41102c5a670..d5b3958be0b 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -156,11 +156,11 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask) void smp_resched_interrupt(void) { + irq_enter(); + scheduler_ipi(); local_cpu_data().irq_resched_count++; - /* - * do nothing, since it all was about calling re-schedule - * routine called by interrupt return code. - */ + irq_exit(); + /* re-schedule routine called by interrupt return code. */ } void smp_call_function_single_interrupt(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index c274a30c3cb..99cb17251bb 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef8..c52224d5ed4 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. */ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. - */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9..eefb107d2d7 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228..013e7eba83b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed9..762b46ab14d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 94b48bd40dd..c75471db576 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -51,7 +51,7 @@ struct mutex { spinlock_t wait_lock; struct list_head wait_list; #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - struct thread_info *owner; + struct task_struct *owner; #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/sched.h b/include/linux/sched.h index 18d63cea284..94107a2c284 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; @@ -1048,8 +1048,12 @@ struct sched_domain; #define WF_FORK 0x02 /* child wakeup after fork */ #define ENQUEUE_WAKEUP 1 -#define ENQUEUE_WAKING 2 -#define ENQUEUE_HEAD 4 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif #define DEQUEUE_SLEEP 1 @@ -1067,12 +1071,11 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct rq *rq, struct task_struct *p, - int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, @@ -1200,10 +1203,10 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - int oncpu; -#endif + struct task_struct *wake_entry; + int on_cpu; #endif + int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; @@ -1274,6 +1277,7 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; @@ -2192,8 +2196,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +void scheduler_ipi(void); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/init/Kconfig b/init/Kconfig index 56240e724d9..32745bfe059 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -827,6 +827,11 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. +config SCHED_TTWU_QUEUE + bool + depends on !SPARC32 + default y + config MM_OWNER bool diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5..73da83aff41 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9..0799fd3e4cf 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa9890..fe4706cb0c5 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ for (;;) { - struct thread_info *owner; + struct task_struct *owner; /* * If we own the BKL, then don't spin. The owner of diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f9..4115fbf83b1 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -19,7 +19,7 @@ #ifdef CONFIG_SMP static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/sched.c b/kernel/sched.c index 312f8b95c2d..8c9d804dc07 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -312,6 +312,9 @@ struct cfs_rq { u64 exec_clock; u64 min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; +#endif struct rb_root tasks_timeline; struct rb_node *rb_leftmost; @@ -553,6 +556,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -596,7 +603,7 @@ static inline int cpu_of(struct rq *rq) * Return the group to which this tasks belongs. * * We use task_subsys_state_check() and extend the RCU verification - * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach() * holds that lock for each task it moves into the cgroup. Therefore * by holding that lock, we pin the task to the current cgroup. */ @@ -606,7 +613,7 @@ static inline struct task_group *task_group(struct task_struct *p) struct cgroup_subsys_state *css; css = task_subsys_state_check(p, cpu_cgroup_subsys_id, - lockdep_is_held(&task_rq(p)->lock)); + lockdep_is_held(&p->pi_lock)); tg = container_of(css, struct task_group, css); return autogroup_task_group(p, tg); @@ -838,18 +845,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -865,15 +893,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -882,7 +901,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. */ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -895,12 +914,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -909,23 +928,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* - * Check whether the task is waking, we use this to synchronize ->cpus_allowed - * against ttwu(). - */ -static inline int task_is_waking(struct task_struct *p) -{ - return unlikely(p->state == TASK_WAKING); -} - -/* - * __task_rq_lock - lock the runqueue a given task resides on. - * Must be called interrupts disabled. + * __task_rq_lock - lock the rq @p resides on. */ static inline struct rq *__task_rq_lock(struct task_struct *p) __acquires(rq->lock) { struct rq *rq; + lockdep_assert_held(&p->pi_lock); + for (;;) { rq = task_rq(p); raw_spin_lock(&rq->lock); @@ -936,22 +947,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) } /* - * task_rq_lock - lock the runqueue a given task resides on and disable - * interrupts. Note the ordering: we can safely lookup the task_rq without - * explicitly disabling preemption. + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. */ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) __acquires(rq->lock) { struct rq *rq; for (;;) { - local_irq_save(*flags); + raw_spin_lock_irqsave(&p->pi_lock, *flags); rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p))) return rq; - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } } @@ -961,10 +972,13 @@ static void __task_rq_unlock(struct rq *rq) raw_spin_unlock(&rq->lock); } -static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) __releases(rq->lock) + __releases(p->pi_lock) { - raw_spin_unlock_irqrestore(&rq->lock, *flags); + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } /* @@ -1773,7 +1787,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1781,7 +1794,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -2116,7 +2128,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2162,6 +2174,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) */ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); + +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&task_rq(p)->lock))); +#endif #endif trace_sched_migrate_task(p, new_cpu); @@ -2185,13 +2202,15 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, struct rq *rq) +static bool need_migrate_task(struct task_struct *p) { /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->se.on_rq || task_running(rq, p); + bool running = p->on_rq || p->on_cpu; + smp_rmb(); /* finish_lock_switch() */ + return running; } /* @@ -2251,11 +2270,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -2330,7 +2349,7 @@ EXPORT_SYMBOL_GPL(kick_process); #ifdef CONFIG_SMP /* - * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. + * ->cpus_allowed is protected by both rq->lock and p->pi_lock */ static int select_fallback_rq(int cpu, struct task_struct *p) { @@ -2363,12 +2382,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. + * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. */ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2394,27 +2413,60 @@ static void update_avg(u64 *avg, u64 sample) } #endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync, bool is_migrate, bool is_local, - unsigned long en_flags) +static void +ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { +#ifdef CONFIG_SCHEDSTATS + struct rq *rq = this_rq(); + +#ifdef CONFIG_SMP + int this_cpu = smp_processor_id(); + + if (cpu == this_cpu) { + schedstat_inc(rq, ttwu_local); + schedstat_inc(p, se.statistics.nr_wakeups_local); + } else { + struct sched_domain *sd; + + schedstat_inc(p, se.statistics.nr_wakeups_remote); + for_each_domain(this_cpu, sd) { + if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { + schedstat_inc(sd, ttwu_wake_remote); + break; + } + } + } +#endif /* CONFIG_SMP */ + + schedstat_inc(rq, ttwu_count); schedstat_inc(p, se.statistics.nr_wakeups); - if (is_sync) + + if (wake_flags & WF_SYNC) schedstat_inc(p, se.statistics.nr_wakeups_sync); - if (is_migrate) + + if (cpu != task_cpu(p)) schedstat_inc(p, se.statistics.nr_wakeups_migrate); - if (is_local) - schedstat_inc(p, se.statistics.nr_wakeups_local); - else - schedstat_inc(p, se.statistics.nr_wakeups_remote); +#endif /* CONFIG_SCHEDSTATS */ +} + +static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) +{ activate_task(rq, p, en_flags); + p->on_rq = 1; + + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - int wake_flags, bool success) +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void +ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) { - trace_sched_wakeup(p, success); + trace_sched_wakeup(p, true); check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -2433,9 +2485,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, rq->idle_stamp = 0; } #endif - /* if a worker is waking up, notify workqueue */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; +#endif + + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (p->on_rq) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; +} + +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} +#endif + +static void ttwu_queue(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + + raw_spin_lock(&rq->lock); + ttwu_do_activate(rq, p, 0); + raw_spin_unlock(&rq->lock); } /** @@ -2453,92 +2595,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, * Returns %true if @p was woken up, %false if it was already running * or @state didn't match @p's state. */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - unsigned long en_flags = ENQUEUE_WAKEUP; - struct rq *rq; - - this_cpu = get_cpu(); + int cpu, success = 0; smp_wmb(); - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); if (!(p->state & state)) goto out; - if (p->se.on_rq) - goto out_running; - + success = 1; /* we're going to change ->state */ cpu = task_cpu(p); - orig_cpu = cpu; -#ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) - goto out_activate; + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; +#ifdef CONFIG_SMP /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. - * - * First fix up the nr_uninterruptible count: + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } - p->state = TASK_WAKING; - - if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); - en_flags |= ENQUEUE_WAKING; + while (p->on_cpu) { +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + /* + * If called from interrupt context we could have landed in the + * middle of schedule(), in this case we should take care not + * to spin on ->on_cpu if p is current, since that would + * deadlock. + */ + if (p == current) { + ttwu_queue(p, cpu); + goto stat; + } +#endif + cpu_relax(); } - - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); - if (cpu != orig_cpu) - set_task_cpu(p, cpu); - __task_rq_unlock(rq); - - rq = cpu_rq(cpu); - raw_spin_lock(&rq->lock); - /* - * We migrated the task without holding either rq->lock, however - * since the task is not on the task list itself, nobody else - * will try and migrate the task, hence the rq should match the - * cpu we just moved it to. + * Pairs with the smp_wmb() in finish_lock_switch(). */ - WARN_ON(task_cpu(p) != cpu); - WARN_ON(p->state != TASK_WAKING); + smp_rmb(); -#ifdef CONFIG_SCHEDSTATS - schedstat_inc(rq, ttwu_count); - if (cpu == this_cpu) - schedstat_inc(rq, ttwu_local); - else { - struct sched_domain *sd; - for_each_domain(this_cpu, sd) { - if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } - } - } -#endif /* CONFIG_SCHEDSTATS */ + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + if (p->sched_class->task_waking) + p->sched_class->task_waking(p); -out_activate: + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); #endif /* CONFIG_SMP */ - ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, - cpu == this_cpu, en_flags); - success = 1; -out_running: - ttwu_post_activation(p, rq, wake_flags, success); + + ttwu_queue(p, cpu); +stat: + ttwu_stat(p, cpu, wake_flags); out: - task_rq_unlock(rq, &flags); - put_cpu(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } @@ -2547,31 +2661,34 @@ out: * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * - * Put @p on the run-queue if it's not already there. The caller must + * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not - * the current task. this_rq() stays locked over invocation. + * the current task. */ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; BUG_ON(rq != this_rq()); BUG_ON(p == current); lockdep_assert_held(&rq->lock); + if (!raw_spin_trylock(&p->pi_lock)) { + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } + if (!(p->state & TASK_NORMAL)) - return; + goto out; - if (!p->se.on_rq) { - if (likely(!task_running(rq, p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); - success = true; - } - ttwu_post_activation(p, rq, 0, success); + if (!p->on_rq) + ttwu_activate(rq, p, ENQUEUE_WAKEUP); + + ttwu_do_wakeup(rq, p, 0); + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -2604,19 +2721,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2628,6 +2747,7 @@ static void __sched_fork(struct task_struct *p) */ void sched_fork(struct task_struct *p, int clone_flags) { + unsigned long flags; int cpu = get_cpu(); __sched_fork(p); @@ -2678,16 +2798,16 @@ void sched_fork(struct task_struct *p, int clone_flags) * * Silence PROVE_RCU. */ - rcu_read_lock(); + raw_spin_lock_irqsave(&p->pi_lock, flags); set_task_cpu(p, cpu); - rcu_read_unlock(); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. */ @@ -2711,37 +2831,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu __maybe_unused = get_cpu(); + raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_SMP - rq = task_rq_lock(p, &flags); - p->state = TASK_WAKING; - /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug - * - * We set TASK_WAKING so that select_task_rq() can drop rq->lock - * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); - set_task_cpu(p, cpu); - - p->state = TASK_RUNNING; - task_rq_unlock(rq, &flags); + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); #endif - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); activate_task(rq, p, 0); - trace_sched_wakeup_new(p, 1); + p->on_rq = 1; + trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); #endif - task_rq_unlock(rq, &flags); - put_cpu(); + task_rq_unlock(rq, p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -3450,27 +3560,22 @@ void sched_exec(void) { struct task_struct *p = current; unsigned long flags; - struct rq *rq; int dest_cpu; - rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + raw_spin_lock_irqsave(&p->pi_lock, flags); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; - /* - * select_task_rq() can race against ->cpus_allowed - */ - if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + if (likely(cpu_active(dest_cpu))) { struct migration_arg arg = { p, dest_cpu }; - task_rq_unlock(rq, &flags); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); return; } unlock: - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #endif @@ -3507,7 +3612,7 @@ unsigned long long task_delta_exec(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3525,7 +3630,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -3549,7 +3654,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); thread_group_cputime(p, &totals); ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -4035,7 +4140,7 @@ static inline void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -4097,11 +4202,13 @@ need_resched: if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + /* - * If a worker is going to sleep, notify and - * ask workqueue whether it wants to wake up a - * task to maintain concurrency. If so, wake - * up the task. + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. */ if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; @@ -4110,11 +4217,10 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } - deactivate_task(rq, prev, DEQUEUE_SLEEP); /* - * If we are going to sleep and we have plugged IO queued, make - * sure to submit it to avoid deadlocks. + * If we are going to sleep and we have plugged IO + * queued, make sure to submit it to avoid deadlocks. */ if (blk_needs_flush_plug(prev)) { raw_spin_unlock(&rq->lock); @@ -4161,70 +4267,53 @@ need_resched: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. - */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? - */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif @@ -4684,19 +4773,18 @@ EXPORT_SYMBOL(sleep_on_timeout); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; int oldprio, on_rq, running; struct rq *rq; const struct sched_class *prev_class; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_rq_lock(p, &flags); + rq = __task_rq_lock(p); trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4716,7 +4804,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); - task_rq_unlock(rq, &flags); + __task_rq_unlock(rq); } #endif @@ -4744,7 +4832,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4764,7 +4852,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_task(rq->curr); } out_unlock: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -4878,8 +4966,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -4994,20 +5080,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* + * * To be able to change p->policy safely, the appropriate * runqueue lock must be held. */ - rq = __task_rq_lock(p); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -5031,8 +5114,7 @@ recheck: if (rt_bandwidth_enabled() && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); return -EPERM; } } @@ -5041,11 +5123,10 @@ recheck: /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -5064,8 +5145,7 @@ recheck: activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio); - __task_rq_unlock(rq); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_rq_unlock(rq, p, &flags); rt_mutex_adjust_pi(p); @@ -5316,7 +5396,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; unsigned long flags; - struct rq *rq; int retval; get_online_cpus(); @@ -5331,9 +5410,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) if (retval) goto out_unlock; - rq = task_rq_lock(p, &flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); - task_rq_unlock(rq, &flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -5658,7 +5737,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, rq = task_rq_lock(p, &flags); time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); jiffies_to_timespec(time_slice, &t); @@ -5776,8 +5855,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); @@ -5881,18 +5960,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) unsigned int dest_cpu; int ret = 0; - /* - * Serialize against TASK_WAKING so that ttwu() and wunt() can - * drop the rq->lock and still rely on ->cpus_allowed. - */ -again: - while (task_is_waking(p)) - cpu_relax(); rq = task_rq_lock(p, &flags); - if (task_is_waking(p)) { - task_rq_unlock(rq, &flags); - goto again; - } if (!cpumask_intersects(new_mask, cpu_active_mask)) { ret = -EINVAL; @@ -5917,16 +5985,16 @@ again: goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } out: - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, p, &flags); return ret; } @@ -5954,6 +6022,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); + raw_spin_lock(&p->pi_lock); double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) @@ -5966,7 +6035,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. */ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -5976,6 +6045,7 @@ done: ret = 1; fail: double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); return ret; } @@ -6316,6 +6386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { @@ -8340,7 +8411,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8683,7 +8754,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); @@ -8702,7 +8773,7 @@ void sched_move_task(struct task_struct *tsk) if (on_rq) enqueue_task(rq, tsk, 0); - task_rq_unlock(rq, &flags); + task_rq_unlock(rq, tsk, &flags); } #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a415..3669bec6e13 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6fa833ab2cb..054cebb81f7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -358,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif } /* @@ -1372,12 +1376,25 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; - se->vruntime -= cfs_rq->min_vruntime; +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1657,7 +1674,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. */ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); @@ -1789,10 +1806,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se) * This is especially important for buddies when the leftmost * task is higher priority than the buddy. */ - if (unlikely(se->load.weight != NICE_0_LOAD)) - gran = calc_delta_fair(gran, se); - - return gran; + return calc_delta_fair(gran, se); } /* diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b..be40f7371ee 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) * Decrement CPU power based on irq activity */ SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a639642..0a51882534e 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -7,7 +7,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f8..19ecb312737 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { + struct task_struct *curr; + struct rq *rq; + int cpu; + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + /* - * If the current task is an RT task, then + * If the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * lock? * * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. */ - if (unlikely(rt_task(rq->curr)) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio) && + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); + int target = find_lowest_rq(p); - return (cpu == -1) ? task_cpu(p) : cpu; + if (target != -1) + cpu = target; } + rcu_read_unlock(); - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); + return cpu; } static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) @@ -1136,7 +1150,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1287,7 +1301,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !task->se.on_rq)) { + !task->on_rq)) { raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; @@ -1321,7 +1335,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->rt.nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); return p; @@ -1467,7 +1481,7 @@ static int pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * There's a chance that p is higher in priority @@ -1538,7 +1552,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); if (!task_current(rq, p)) { @@ -1608,7 +1622,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->se.on_rq && !rq->rt.rt_nr_running) + if (p->on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1638,7 +1652,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. */ - if (p->se.on_rq && rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1657,7 +1671,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; if (rq->curr == p) { diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fda..6f437632afa 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -9,8 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct rq *rq, struct task_struct *p, - int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } @@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->se.on_rq) + if (stop && stop->on_rq) return stop; return NULL; |