From dce840a08702bd13a9a186e07e63d1ef82256b5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:50 +0200 Subject: sched: Dynamically allocate sched_domain/sched_group data-structures Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them. Allocating this dynamically also allows for some build_sched_groups() simplification since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge. One tricky thing to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down, per-cpu is not sufficient since that can lead to partial sched_group existence (could possibly be solved by doing the tear-down backwards but this is much more robust). A consequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain() since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations. Another thing to note is that we now fully rely on normal RCU and not RCU-sched, this is because with the new and exciting RCU flavours we grew over the years BH doesn't necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief since we do sched_domain/sched_group iterations from softirq context. This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting this. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c027e92..020b79d6c48 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -868,6 +868,7 @@ static inline int sd_power_saving_flags(void) struct sched_group { struct sched_group *next; /* Must be a circular list */ + atomic_t ref; /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a @@ -973,6 +974,10 @@ struct sched_domain { #ifdef CONFIG_SCHED_DEBUG char *name; #endif + union { + void *private; /* used during construction */ + struct rcu_head rcu; /* used during destruction */ + }; unsigned int span_weight; /* -- cgit v1.2.3-70-g09d2 From 3859173d43658d51a749bc0201b943922577d39c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:53 +0200 Subject: sched: Reduce some allocation pressure Since we now allocate SD_LV_MAX * nr_cpu_ids sched_domain/sched_group structures when rebuilding the scheduler topology it might make sense to shrink that depending on the CONFIG_ options. This is only needed until we get rid of SD_LV_* altogether and provide a full dynamic topology interface. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.406226449@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 020b79d6c48..5a9168b01db 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -897,12 +897,20 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) enum sched_domain_level { SD_LV_NONE = 0, +#ifdef CONFIG_SCHED_SMT SD_LV_SIBLING, +#endif +#ifdef CONFIG_SCHED_MC SD_LV_MC, +#endif +#ifdef CONFIG_SCHED_BOOK SD_LV_BOOK, +#endif SD_LV_CPU, +#ifdef CONFIG_NUMA SD_LV_NODE, SD_LV_ALLNODES, +#endif SD_LV_MAX }; -- cgit v1.2.3-70-g09d2 From 7dd04b730749f957c116f363524fd622b05e5141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:09:56 +0200 Subject: sched: Remove some dead code Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.553814623@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ------ kernel/sched.c | 16 ---------------- 2 files changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a9168b01db..09d9e02f2b6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -883,9 +883,6 @@ struct sched_group { * NOTE: this field is variable length. (Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_group' in kernel/sched.c) */ unsigned long cpumask[0]; }; @@ -994,9 +991,6 @@ struct sched_domain { * NOTE: this field is variable length. 
(Allocated dynamically * by attaching extra space to the end of the structure, * depending on how many CPUs the kernel has booted up with) - * - * It is also be embedded into static data structures at build - * time. (See 'struct static_sched_domain' in kernel/sched.c) */ unsigned long span[0]; }; diff --git a/kernel/sched.c b/kernel/sched.c index f4d3a624c50..5ec685ce516 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6816,22 +6816,6 @@ static void sched_domain_node_span(int node, struct cpumask *span) int sched_smt_power_savings = 0, sched_mc_power_savings = 0; -/* - * The cpus mask in sched_group and sched_domain hangs off the end. - * - * ( See the the comments in include/linux/sched.h:struct sched_group - * and struct sched_domain. ) - */ -struct static_sched_group { - struct sched_group sg; - DECLARE_BITMAP(cpus, CONFIG_NR_CPUS); -}; - -struct static_sched_domain { - struct sched_domain sd; - DECLARE_BITMAP(span, CONFIG_NR_CPUS); -}; - struct sd_data { struct sched_domain **__percpu sd; struct sched_group **__percpu sg; -- cgit v1.2.3-70-g09d2 From 60495e7760d8ee364695006af37309b0755e0e17 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Apr 2011 14:10:04 +0200 Subject: sched: Dynamic sched_domain::level Remove the SD_LV_ enum and use dynamic level assignments. 
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110407122942.969433965@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 23 +++-------------------- kernel/cpuset.c | 2 +- kernel/sched.c | 9 ++++++--- 3 files changed, 10 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 09d9e02f2b6..e43e5b0ab0b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -892,25 +892,6 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } -enum sched_domain_level { - SD_LV_NONE = 0, -#ifdef CONFIG_SCHED_SMT - SD_LV_SIBLING, -#endif -#ifdef CONFIG_SCHED_MC - SD_LV_MC, -#endif -#ifdef CONFIG_SCHED_BOOK - SD_LV_BOOK, -#endif - SD_LV_CPU, -#ifdef CONFIG_NUMA - SD_LV_NODE, - SD_LV_ALLNODES, -#endif - SD_LV_MAX -}; - struct sched_domain_attr { int relax_domain_level; }; @@ -919,6 +900,8 @@ struct sched_domain_attr { .relax_domain_level = -1, \ } +extern int sched_domain_level_max; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -936,7 +919,7 @@ struct sched_domain { unsigned int forkexec_idx; unsigned int smt_gain; int flags; /* See SD_* */ - enum sched_domain_level level; + int level; /* Runtime fields. */ unsigned long last_balance; /* init to jiffies. 
units in jiffies */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 33eee16addb..2bb8c2e98ff 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1159,7 +1159,7 @@ int current_cpuset_is_being_rebound(void) static int update_relax_domain_level(struct cpuset *cs, s64 val) { #ifdef CONFIG_SMP - if (val < -1 || val >= SD_LV_MAX) + if (val < -1 || val >= sched_domain_level_max) return -EINVAL; #endif diff --git a/kernel/sched.c b/kernel/sched.c index 3231e199742..506cb8147c7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6966,7 +6966,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ { \ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ *sd = SD_##type##_INIT; \ - sd->level = SD_LV_##type; \ SD_INIT_NAME(sd, type); \ sd->private = &tl->data; \ return sd; \ @@ -6988,13 +6987,14 @@ SD_INIT_FUNC(CPU) #endif static int default_relax_domain_level = -1; +int sched_domain_level_max; static int __init setup_relax_domain_level(char *str) { unsigned long val; val = simple_strtoul(str, NULL, 0); - if (val < SD_LV_MAX) + if (val < sched_domain_level_max) default_relax_domain_level = val; return 1; @@ -7173,8 +7173,11 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, set_domain_attribute(sd, attr); cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); - if (child) + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); child->parent = sd; + } sd->child = child; return sd; -- cgit v1.2.3-70-g09d2 From 184748cc50b2dceb8287f9fb657eda48ff8fcfe7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:39 +0200 Subject: sched: Provide scheduler_ipi() callback in response to smp_send_reschedule() For future rework of try_to_wake_up() we'd like to push part of that function onto the CPU the task is actually going to run on. In order to do so we need a generic callback from the existing scheduler IPI. 
This patch introduces such a generic callback: scheduler_ipi() and implements it as a NOP. BenH notes: PowerPC might use this IPI on offline CPUs under rare conditions! Acked-by: Russell King Acked-by: Martin Schwidefsky Acked-by: Chris Metcalf Acked-by: Jesper Nilsson Acked-by: Benjamin Herrenschmidt Signed-off-by: Ralf Baechle Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.744338123@chello.nl --- arch/alpha/kernel/smp.c | 3 +-- arch/arm/kernel/smp.c | 5 +---- arch/blackfin/mach-common/smp.c | 3 +++ arch/cris/arch-v32/kernel/smp.c | 13 ++++++++----- arch/ia64/kernel/irq_ia64.c | 2 ++ arch/ia64/xen/irq_xen.c | 10 +++++++++- arch/m32r/kernel/smp.c | 4 +--- arch/mips/cavium-octeon/smp.c | 2 ++ arch/mips/kernel/smtc.c | 2 +- arch/mips/mti-malta/malta-int.c | 2 ++ arch/mips/pmc-sierra/yosemite/smp.c | 4 ++++ arch/mips/sgi-ip27/ip27-irq.c | 2 ++ arch/mips/sibyte/bcm1480/smp.c | 7 +++---- arch/mips/sibyte/sb1250/smp.c | 7 +++---- arch/mn10300/kernel/smp.c | 5 +---- arch/parisc/kernel/smp.c | 5 +---- arch/powerpc/kernel/smp.c | 4 ++-- arch/s390/kernel/smp.c | 6 +++--- arch/sh/kernel/smp.c | 2 ++ arch/sparc/kernel/smp_32.c | 4 +++- arch/sparc/kernel/smp_64.c | 1 + arch/tile/kernel/smp.c | 6 +----- arch/um/kernel/smp.c | 2 +- arch/x86/kernel/smp.c | 5 ++--- arch/x86/xen/smp.c | 5 ++--- include/linux/sched.h | 2 ++ 26 files changed, 63 insertions(+), 50 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 42aa078a5e4..5a621c6d22a 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -585,8 +585,7 @@ handle_ipi(struct pt_regs *regs) switch (which) { case IPI_RESCHEDULE: - /* Reschedule callback. Everything to be done - is done by the interrupt return path. 
*/ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 8fe05ad932e..7a561eb731e 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -560,10 +560,7 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs) break; case IPI_RESCHEDULE: - /* - * nothing more to do - eveything is - * done on the interrupt return path - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c index 6e17a265c4d..326bb86f4d2 100644 --- a/arch/blackfin/mach-common/smp.c +++ b/arch/blackfin/mach-common/smp.c @@ -164,6 +164,9 @@ static irqreturn_t ipi_handler_int1(int irq, void *dev_instance) while (msg_queue->count) { msg = &msg_queue->ipi_message[msg_queue->head]; switch (msg->type) { + case BFIN_IPI_RESCHEDULE: + scheduler_ipi(); + break; case BFIN_IPI_CALL_FUNC: spin_unlock_irqrestore(&msg_queue->lock, flags); ipi_call_function(cpu, msg); diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 4c9e3e1ba5d..66cc75657e2 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -342,15 +342,18 @@ irqreturn_t crisv32_ipi_interrupt(int irq, void *dev_id) ipi = REG_RD(intr_vect, irq_regs[smp_processor_id()], rw_ipi); + if (ipi.vector & IPI_SCHEDULE) { + scheduler_ipi(); + } if (ipi.vector & IPI_CALL) { - func(info); + func(info); } if (ipi.vector & IPI_FLUSH_TLB) { - if (flush_mm == FLUSH_ALL) - __flush_tlb_all(); - else if (flush_vma == FLUSH_ALL) + if (flush_mm == FLUSH_ALL) + __flush_tlb_all(); + else if (flush_vma == FLUSH_ALL) __flush_tlb_mm(flush_mm); - else + else __flush_tlb_page(flush_vma, flush_addr); } diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 5b704740f16..782c3a357f2 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -496,6 +497,7 
@@ ia64_handle_irq (ia64_vector vector, struct pt_regs *regs) smp_local_flush_tlb(); kstat_incr_irqs_this_cpu(irq, desc); } else if (unlikely(IS_RESCHEDULE(vector))) { + scheduler_ipi(); kstat_incr_irqs_this_cpu(irq, desc); } else { ia64_setreg(_IA64_REG_CR_TPR, vector); diff --git a/arch/ia64/xen/irq_xen.c b/arch/ia64/xen/irq_xen.c index 108bb858acf..b279e142c63 100644 --- a/arch/ia64/xen/irq_xen.c +++ b/arch/ia64/xen/irq_xen.c @@ -92,6 +92,8 @@ static unsigned short saved_irq_cnt; static int xen_slab_ready; #ifdef CONFIG_SMP +#include + /* Dummy stub. Though we may check XEN_RESCHEDULE_VECTOR before __do_IRQ, * it ends up to issue several memory accesses upon percpu data and * thus adds unnecessary traffic to other paths. @@ -99,7 +101,13 @@ static int xen_slab_ready; static irqreturn_t xen_dummy_handler(int irq, void *dev_id) { + return IRQ_HANDLED; +} +static irqreturn_t +xen_resched_handler(int irq, void *dev_id) +{ + scheduler_ipi(); return IRQ_HANDLED; } @@ -110,7 +118,7 @@ static struct irqaction xen_ipi_irqaction = { }; static struct irqaction xen_resched_irqaction = { - .handler = xen_dummy_handler, + .handler = xen_resched_handler, .flags = IRQF_DISABLED, .name = "resched" }; diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 31cef20b299..fc10b39893d 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -122,8 +122,6 @@ void smp_send_reschedule(int cpu_id) * * Description: This routine executes on CPU which received * 'RESCHEDULE_IPI'. - * Rescheduling is processed at the exit of interrupt - * operation. 
* * Born on Date: 2002.02.05 * @@ -138,7 +136,7 @@ void smp_send_reschedule(int cpu_id) *==========================================================================*/ void smp_reschedule_interrupt(void) { - /* nothing to do */ + scheduler_ipi(); } /*==========================================================================* diff --git a/arch/mips/cavium-octeon/smp.c b/arch/mips/cavium-octeon/smp.c index ba78b21cc8d..76923eeb58b 100644 --- a/arch/mips/cavium-octeon/smp.c +++ b/arch/mips/cavium-octeon/smp.c @@ -44,6 +44,8 @@ static irqreturn_t mailbox_interrupt(int irq, void *dev_id) if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); /* Check if we've been told to flush the icache */ if (action & SMP_ICACHE_FLUSH) diff --git a/arch/mips/kernel/smtc.c b/arch/mips/kernel/smtc.c index 5a88cc4ccd5..cedac463374 100644 --- a/arch/mips/kernel/smtc.c +++ b/arch/mips/kernel/smtc.c @@ -929,7 +929,7 @@ static void post_direct_ipi(int cpu, struct smtc_ipi *pipi) static void ipi_resched_interrupt(void) { - /* Return from interrupt should be enough to cause scheduler check */ + scheduler_ipi(); } static void ipi_call_interrupt(void) diff --git a/arch/mips/mti-malta/malta-int.c b/arch/mips/mti-malta/malta-int.c index 9027061f0ea..7d93e6fbfa5 100644 --- a/arch/mips/mti-malta/malta-int.c +++ b/arch/mips/mti-malta/malta-int.c @@ -309,6 +309,8 @@ static void ipi_call_dispatch(void) static irqreturn_t ipi_resched_interrupt(int irq, void *dev_id) { + scheduler_ipi(); + return IRQ_HANDLED; } diff --git a/arch/mips/pmc-sierra/yosemite/smp.c b/arch/mips/pmc-sierra/yosemite/smp.c index efc9e889b34..2608752898c 100644 --- a/arch/mips/pmc-sierra/yosemite/smp.c +++ b/arch/mips/pmc-sierra/yosemite/smp.c @@ -55,6 +55,8 @@ void titan_mailbox_irq(void) if (status & 0x2) smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; case 1: @@ -63,6 +65,8 @@ void titan_mailbox_irq(void) if (status & 0x2) 
smp_call_function_interrupt(); + if (status & 0x4) + scheduler_ipi(); break; } } diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 0a04603d577..b18b04e4857 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -147,8 +147,10 @@ static void ip27_do_irq_mask0(void) #ifdef CONFIG_SMP if (pend0 & (1UL << CPU_RESCHED_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_A_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_RESCHED_B_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_RESCHED_B_IRQ); + scheduler_ipi(); } else if (pend0 & (1UL << CPU_CALL_A_IRQ)) { LOCAL_HUB_CLR_INTR(CPU_CALL_A_IRQ); smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/bcm1480/smp.c b/arch/mips/sibyte/bcm1480/smp.c index 47b347c992e..d667875be56 100644 --- a/arch/mips/sibyte/bcm1480/smp.c +++ b/arch/mips/sibyte/bcm1480/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -189,10 +190,8 @@ void bcm1480_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ __raw_writeq(((u64)action)<<48, mailbox_0_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); diff --git a/arch/mips/sibyte/sb1250/smp.c b/arch/mips/sibyte/sb1250/smp.c index c00a5cb1128..38e7f6bd792 100644 --- a/arch/mips/sibyte/sb1250/smp.c +++ b/arch/mips/sibyte/sb1250/smp.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -177,10 +178,8 @@ void sb1250_mailbox_interrupt(void) /* Clear the mailbox to clear the interrupt */ ____raw_writeq(((u64)action) << 48, mailbox_clear_regs[cpu]); - /* - * Nothing to do for SMP_RESCHEDULE_YOURSELF; returning from the - * interrupt will do the reschedule for us - */ + if (action & SMP_RESCHEDULE_YOURSELF) + scheduler_ipi(); if (action & SMP_CALL_FUNCTION) smp_call_function_interrupt(); 
diff --git a/arch/mn10300/kernel/smp.c b/arch/mn10300/kernel/smp.c index 226c826a219..83fb2791223 100644 --- a/arch/mn10300/kernel/smp.c +++ b/arch/mn10300/kernel/smp.c @@ -494,14 +494,11 @@ void smp_send_stop(void) * @irq: The interrupt number. * @dev_id: The device ID. * - * We need do nothing here, since the scheduling will be effected on our way - * back through entry.S. - * * Returns IRQ_HANDLED to indicate we handled the interrupt successfully. */ static irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id) { - /* do nothing */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 69d63d354ef..828305f19cf 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -155,10 +155,7 @@ ipi_interrupt(int irq, void *dev_id) case IPI_RESCHEDULE: smp_debug(100, KERN_DEBUG "CPU%d IPI_RESCHEDULE\n", this_cpu); - /* - * Reschedule callback. Everything to be - * done is done by the interrupt return path. - */ + scheduler_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index cbdbb14be4b..9f9c204bef6 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -116,7 +116,7 @@ void smp_message_recv(int msg) generic_smp_call_function_interrupt(); break; case PPC_MSG_RESCHEDULE: - /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -146,7 +146,7 @@ static irqreturn_t call_function_action(int irq, void *data) static irqreturn_t reschedule_action(int irq, void *data) { - /* we just need the return path side effect of checking need_resched */ + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 63a97db83f9..63c7d9ff220 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -165,12 +165,12 @@ static void do_ext_call_interrupt(unsigned int ext_int_code, 
kstat_cpu(smp_processor_id()).irqs[EXTINT_IPI]++; /* * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. */ bits = xchg(&S390_lowcore.ext_call_fast, 0); + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 509b36b4511..6207561ea34 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -323,6 +324,7 @@ void smp_message_recv(unsigned int msg) generic_smp_call_function_interrupt(); break; case SMP_MSG_RESCHEDULE: + scheduler_ipi(); break; case SMP_MSG_FUNCTION_SINGLE: generic_smp_call_function_single_interrupt(); diff --git a/arch/sparc/kernel/smp_32.c b/arch/sparc/kernel/smp_32.c index 91c10fb7085..f95690c167b 100644 --- a/arch/sparc/kernel/smp_32.c +++ b/arch/sparc/kernel/smp_32.c @@ -125,7 +125,9 @@ struct linux_prom_registers smp_penguin_ctable __cpuinitdata = { 0 }; void smp_send_reschedule(int cpu) { - /* See sparc64 */ + /* + * XXX missing reschedule IPI, see scheduler_ipi() + */ } void smp_send_stop(void) diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 3e94a8c2323..9478da7fdb3 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1368,6 +1368,7 @@ void smp_send_reschedule(int cpu) void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + scheduler_ipi(); } /* This is a nop because we capture all other cpus diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c index a4293102ef8..c52224d5ed4 100644 --- a/arch/tile/kernel/smp.c +++ b/arch/tile/kernel/smp.c @@ -189,12 +189,8 @@ void flush_icache_range(unsigned long start, unsigned long end) /* Called when smp_send_reschedule() triggers IRQ_RESCHEDULE. 
*/ static irqreturn_t handle_reschedule_ipi(int irq, void *token) { - /* - * Nothing to do here; when we return from interrupt, the - * rescheduling will occur there. But do bump the interrupt - * profiler count in the meantime. - */ __get_cpu_var(irq_stat).irq_resched_count++; + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index 106bf27e2a9..eefb107d2d7 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -173,7 +173,7 @@ void IPI_handler(int cpu) break; case 'R': - set_tsk_need_resched(current); + scheduler_ipi(); break; case 'S': diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 513deac7228..013e7eba83b 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. */ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + scheduler_ipi(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 30612441ed9..762b46ab14d 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -46,13 +46,12 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. 
*/ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) { inc_irq_stat(irq_resched_count); + scheduler_ipi(); return IRQ_HANDLED; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 4ec2c027e92..758e27afcda 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2189,8 +2189,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +static inline void scheduler_ipi(void) { } extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void scheduler_ipi(void) { } static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { -- cgit v1.2.3-70-g09d2 From 3ca7a440da394808571dad32d33d3bc0389982e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:40 +0200 Subject: sched: Always provide p->on_cpu Always provide p->on_cpu so that we can determine if its on a cpu without having to lock the rq. 
Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152728.785452014@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 +--- kernel/sched.c | 46 +++++++++++++++++++++++++++++----------------- 2 files changed, 30 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 758e27afcda..3435837e89f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1200,9 +1200,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP -#ifdef __ARCH_WANT_UNLOCKED_CTXSW - int oncpu; -#endif + int on_cpu; #endif int prio, static_prio, normal_prio; diff --git a/kernel/sched.c b/kernel/sched.c index a187c3fe027..cd2593e1a3e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -838,18 +838,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline int task_running(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_SMP + return p->on_cpu; +#else return task_current(rq, p); +#endif } +#ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif } static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. 
+ */ + smp_wmb(); + prev->on_cpu = 0; +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ rq->lock.owner = current; @@ -865,15 +886,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #else /* __ARCH_WANT_UNLOCKED_CTXSW */ -static inline int task_running(struct rq *rq, struct task_struct *p) -{ -#ifdef CONFIG_SMP - return p->oncpu; -#else - return task_current(rq, p); -#endif -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { #ifdef CONFIG_SMP @@ -882,7 +894,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) * SMP rebalancing from interrupt is the only thing that cares * here. */ - next->oncpu = 1; + next->on_cpu = 1; #endif #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW raw_spin_unlock_irq(&rq->lock); @@ -895,12 +907,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { #ifdef CONFIG_SMP /* - * After ->oncpu is cleared, the task can be moved to a different CPU. + * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. */ smp_wmb(); - prev->oncpu = 0; + prev->on_cpu = 0; #endif #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_enable(); @@ -2686,8 +2698,8 @@ void sched_fork(struct task_struct *p, int clone_flags) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - p->oncpu = 0; +#if defined(CONFIG_SMP) + p->on_cpu = 0; #endif #ifdef CONFIG_PREEMPT /* Want to start with kernel preemption disabled. 
*/ @@ -5776,8 +5788,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; -#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) - idle->oncpu = 1; +#if defined(CONFIG_SMP) + idle->on_cpu = 1; #endif raw_spin_unlock_irqrestore(&rq->lock, flags); -- cgit v1.2.3-70-g09d2 From c6eb3dda25892f1f974f5420f63e6721aab02f6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:41 +0200 Subject: mutex: Use p->on_cpu for the adaptive spin Since we now have p->on_cpu unconditionally available, use it to re-implement mutex_spin_on_owner. Requested-by: Thomas Gleixner Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.826338173@chello.nl --- include/linux/mutex.h | 2 +- include/linux/sched.h | 2 +- kernel/mutex-debug.c | 2 +- kernel/mutex-debug.h | 2 +- kernel/mutex.c | 2 +- kernel/mutex.h | 2 +- kernel/sched.c | 83 ++++++++++++++++++++------------------------------- 7 files changed, 39 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 94b48bd40dd..c75471db576 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -51,7 +51,7 @@ struct mutex { spinlock_t wait_lock; struct list_head wait_list; #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - struct thread_info *owner; + struct task_struct *owner; #endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; diff --git a/include/linux/sched.h b/include/linux/sched.h index 3435837e89f..173850479e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -360,7 +360,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); 
-extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner); +extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index ec815a960b5..73da83aff41 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock) return; DEBUG_LOCKS_WARN_ON(lock->magic != lock); - DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); + DEBUG_LOCKS_WARN_ON(lock->owner != current); DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); mutex_clear_owner(lock); } diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index 57d527a16f9..0799fd3e4cf 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/mutex.c b/kernel/mutex.c index c4195fa9890..fe4706cb0c5 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -160,7 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ for (;;) { - struct thread_info *owner; + struct task_struct *owner; /* * If we own the BKL, then don't spin. 
The owner of diff --git a/kernel/mutex.h b/kernel/mutex.h index 67578ca48f9..4115fbf83b1 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -19,7 +19,7 @@ #ifdef CONFIG_SMP static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current_thread_info(); + lock->owner = current; } static inline void mutex_clear_owner(struct mutex *lock) diff --git a/kernel/sched.c b/kernel/sched.c index cd2593e1a3e..55cc50323ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4173,70 +4173,53 @@ need_resched: EXPORT_SYMBOL(schedule); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) -{ - unsigned int cpu; - struct rq *rq; - if (!sched_feat(OWNER_SPIN)) - return 0; +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; -#ifdef CONFIG_DEBUG_PAGEALLOC - /* - * Need to access the cpu field knowing that - * DEBUG_PAGEALLOC could have unmapped it if - * the mutex owner just released it and exited. - */ - if (probe_kernel_address(&owner->cpu, cpu)) - return 0; -#else - cpu = owner->cpu; -#endif + rcu_read_lock(); + if (lock->owner != owner) + goto fail; /* - * Even if the access succeeded (likely case), - * the cpu field may no longer be valid. + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. */ - if (cpu >= nr_cpumask_bits) - return 0; + barrier(); - /* - * We need to validate that we can do a - * get_cpu() and that we have the percpu area. - */ - if (!cpu_online(cpu)) - return 0; + ret = owner->on_cpu; +fail: + rcu_read_unlock(); - rq = cpu_rq(cpu); + return ret; +} - for (;;) { - /* - * Owner changed, break to re-assess state. 
- */ - if (lock->owner != owner) { - /* - * If the lock has switched to a different owner, - * we likely have heavy contention. Return 0 to quit - * optimistic spinning and not contend further: - */ - if (lock->owner) - return 0; - break; - } +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; - /* - * Is that owner really running on that cpu? - */ - if (task_thread_info(rq->curr) != owner || need_resched()) + while (owner_running(lock, owner)) { + if (need_resched()) return 0; arch_mutex_cpu_relax(); } + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + return 1; } #endif -- cgit v1.2.3-70-g09d2 From fd2f4419b4cbe8fe90796df9617c355762afd6a4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:44 +0200 Subject: sched: Provide p->on_rq Provide a generic p->on_rq because the p->se.on_rq semantics are unfavourable for lockless wakeups but needed for sched_fair. In particular, p->on_rq is only cleared when we actually dequeue the task in schedule() and not on any random dequeue as done by things like __migrate_task() and __sched_setscheduler(). This also allows us to remove p->se usage from !sched_fair code. 
Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152728.949545047@chello.nl --- include/linux/sched.h | 1 + kernel/sched.c | 38 ++++++++++++++++++++------------------ kernel/sched_debug.c | 2 +- kernel/sched_rt.c | 16 ++++++++-------- kernel/sched_stoptask.c | 2 +- 5 files changed, 31 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 173850479e2..b33a700652d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1202,6 +1202,7 @@ struct task_struct { #ifdef CONFIG_SMP int on_cpu; #endif + int on_rq; int prio, static_prio, normal_prio; unsigned int rt_priority; diff --git a/kernel/sched.c b/kernel/sched.c index 4481638f917..dece28e505c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1785,7 +1785,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_queued(p); p->sched_class->enqueue_task(rq, p, flags); - p->se.on_rq = 1; } static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1793,7 +1792,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) update_rq_clock(rq); sched_info_dequeued(p); p->sched_class->dequeue_task(rq, p, flags); - p->se.on_rq = 0; } /* @@ -2128,7 +2126,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ - if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) + if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) rq->skip_clock_update = 1; } @@ -2203,7 +2201,7 @@ static bool migrate_task(struct task_struct *p, struct rq *rq) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. 
*/ - return p->se.on_rq || task_running(rq, p); + return p->on_rq || task_running(rq, p); } /* @@ -2263,7 +2261,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); running = task_running(rq, p); - on_rq = p->se.on_rq; + on_rq = p->on_rq; ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ @@ -2444,6 +2442,7 @@ ttwu_stat(struct rq *rq, struct task_struct *p, int cpu, int wake_flags) static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) { activate_task(rq, p, en_flags); + p->on_rq = 1; /* if a worker is waking up, notify workqueue */ if (p->flags & PF_WQ_WORKER) @@ -2506,7 +2505,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, cpu = task_cpu(p); - if (p->se.on_rq) + if (p->on_rq) goto out_running; orig_cpu = cpu; @@ -2583,7 +2582,7 @@ static void try_to_wake_up_local(struct task_struct *p) if (!(p->state & TASK_NORMAL)) return; - if (!p->se.on_rq) + if (!p->on_rq) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_post_activation(p, rq, 0); @@ -2620,19 +2619,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) */ static void __sched_fork(struct task_struct *p) { + p->on_rq = 0; + + p->se.on_rq = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_SCHEDSTATS memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif INIT_LIST_HEAD(&p->rt.run_list); - p->se.on_rq = 0; - INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2750,6 +2751,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); activate_task(rq, p, 0); + p->on_rq = 1; trace_sched_wakeup_new(p, true); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -4051,7 +4053,7 @@ static inline 
void schedule_debug(struct task_struct *prev) static void put_prev_task(struct rq *rq, struct task_struct *prev) { - if (prev->se.on_rq) + if (prev->on_rq) update_rq_clock(rq); prev->sched_class->put_prev_task(rq, prev); } @@ -4126,7 +4128,9 @@ need_resched: if (to_wakeup) try_to_wake_up_local(to_wakeup); } + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; /* * If we are going to sleep and we have plugged IO queued, make @@ -4695,7 +4699,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) dequeue_task(rq, p, 0); @@ -4743,7 +4747,7 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) dequeue_task(rq, p, 0); @@ -4877,8 +4881,6 @@ static struct task_struct *find_process_by_pid(pid_t pid) static void __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) { - BUG_ON(p->se.on_rq); - p->policy = policy; p->rt_priority = prio; p->normal_prio = normal_prio(p); @@ -5044,7 +5046,7 @@ recheck: raw_spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } - on_rq = p->se.on_rq; + on_rq = p->on_rq; running = task_current(rq, p); if (on_rq) deactivate_task(rq, p, 0); @@ -5965,7 +5967,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * If we're not on a rq, the next wake-up will ensure we're * placed properly. 
*/ - if (p->se.on_rq) { + if (p->on_rq) { deactivate_task(rq_src, p, 0); set_task_cpu(p, dest_cpu); activate_task(rq_dest, p, 0); @@ -8339,7 +8341,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p) int old_prio = p->prio; int on_rq; - on_rq = p->se.on_rq; + on_rq = p->on_rq; if (on_rq) deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); @@ -8682,7 +8684,7 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); running = task_current(rq, tsk); - on_rq = tsk->se.on_rq; + on_rq = tsk->on_rq; if (on_rq) dequeue_task(rq, tsk, 0); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 7bacd83a415..3669bec6e13 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) read_lock_irqsave(&tasklist_lock, flags); do_each_thread(g, p) { - if (!p->se.on_rq || task_cpu(p) != rq_cpu) + if (!p->on_rq || task_cpu(p) != rq_cpu) continue; print_task(m, rq, p); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index e7cebdc65f8..9ca4f5f879c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1136,7 +1136,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (p->se.on_rq && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1287,7 +1287,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || task_running(rq, task) || - !task->se.on_rq)) { + !task->on_rq)) { raw_spin_unlock(&lowest_rq->lock); lowest_rq = NULL; @@ -1321,7 +1321,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(task_current(rq, p)); BUG_ON(p->rt.nr_cpus_allowed <= 1); - BUG_ON(!p->se.on_rq); + BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); return p; @@ -1467,7 +1467,7 @@ static int 
pull_rt_task(struct rq *this_rq) */ if (p && (p->prio < this_rq->rt.highest_prio.curr)) { WARN_ON(p == src_rq->curr); - WARN_ON(!p->se.on_rq); + WARN_ON(!p->on_rq); /* * There's a chance that p is higher in priority @@ -1538,7 +1538,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Update the migration status of the RQ if we have an RT task * which is running AND changing its weight value. */ - if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { struct rq *rq = task_rq(p); if (!task_current(rq, p)) { @@ -1608,7 +1608,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) * we may need to handle the pulling of RT tasks * now. */ - if (p->se.on_rq && !rq->rt.rt_nr_running) + if (p->on_rq && !rq->rt.rt_nr_running) pull_rt_task(rq); } @@ -1638,7 +1638,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * If that current running task is also an RT task * then see if we can move to another run queue. 
*/ - if (p->se.on_rq && rq->curr != p) { + if (p->on_rq && rq->curr != p) { #ifdef CONFIG_SMP if (rq->rt.overloaded && push_rt_task(rq) && /* Don't resched if we changed runqueues */ @@ -1657,7 +1657,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) static void prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) { - if (!p->se.on_rq) + if (!p->on_rq) return; if (rq->curr == p) { diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index 1ba2bd40fda..f607de42e6f 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -26,7 +26,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq) { struct task_struct *stop = rq->stop; - if (stop && stop->se.on_rq) + if (stop && stop->on_rq) return stop; return NULL; -- cgit v1.2.3-70-g09d2 From 7608dec2ce2004c234339bef8c8074e5e601d0e9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:46 +0200 Subject: sched: Drop the rq argument to sched_class::select_task_rq() In preparation of calling select_task_rq() without rq->lock held, drop the dependency on the rq argument. 
Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.031077745@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +-- kernel/sched.c | 20 +++++++++++--------- kernel/sched_fair.c | 2 +- kernel/sched_idletask.c | 2 +- kernel/sched_rt.c | 38 ++++++++++++++++++++++++++------------ kernel/sched_stoptask.c | 3 +-- 6 files changed, 41 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b33a700652d..ff4e2f9c24a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1067,8 +1067,7 @@ struct sched_class { void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP - int (*select_task_rq)(struct rq *rq, struct task_struct *p, - int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); diff --git a/kernel/sched.c b/kernel/sched.c index d398f2f0a3c..d4b815d345b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2195,13 +2195,15 @@ static int migration_cpu_stop(void *data); * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static bool migrate_task(struct task_struct *p, struct rq *rq) +static bool need_migrate_task(struct task_struct *p) { /* * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - return p->on_rq || task_running(rq, p); + bool running = p->on_rq || p->on_cpu; + smp_rmb(); /* finish_lock_switch() */ + return running; } /* @@ -2376,9 +2378,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 
*/ static inline -int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) { - int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); + int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); /* * In order not to call set_task_cpu() on a blocking task we need @@ -2533,7 +2535,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, en_flags |= ENQUEUE_WAKING; } - cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); if (cpu != orig_cpu) set_task_cpu(p, cpu); __task_rq_unlock(rq); @@ -2744,7 +2746,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * We set TASK_WAKING so that select_task_rq() can drop rq->lock * without people poking at ->cpus_allowed. */ - cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); set_task_cpu(p, cpu); p->state = TASK_RUNNING; @@ -3474,7 +3476,7 @@ void sched_exec(void) int dest_cpu; rq = task_rq_lock(p, &flags); - dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) goto unlock; @@ -3482,7 +3484,7 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { + likely(cpu_active(dest_cpu)) && need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; task_rq_unlock(rq, &flags); @@ -5911,7 +5913,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); - if (migrate_task(p, rq)) { + if (need_migrate_task(p)) { struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
*/ __task_rq_unlock(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4ee50f0af8d..96b2c95ac35 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1657,7 +1657,7 @@ static int select_idle_sibling(struct task_struct *p, int target) * preempt must be disabled. */ static int -select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) { struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; int cpu = smp_processor_id(); diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index a776a639642..0a51882534e 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -7,7 +7,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ca4f5f879c..19ecb312737 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -977,13 +977,23 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) { + struct task_struct *curr; + struct rq *rq; + int cpu; + if (sd_flag != SD_BALANCE_WAKE) return smp_processor_id(); + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + /* - * If the current task is an RT task, then + * If the current task on @p's runqueue is an RT task, then * try to see if we can wake this RT task up on another * runqueue. Otherwise simply start this RT task * on its current runqueue. @@ -997,21 +1007,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags) * lock? 
* * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. */ - if (unlikely(rt_task(rq->curr)) && - (rq->curr->rt.nr_cpus_allowed < 2 || - rq->curr->prio < p->prio) && + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio < p->prio) && (p->rt.nr_cpus_allowed > 1)) { - int cpu = find_lowest_rq(p); + int target = find_lowest_rq(p); - return (cpu == -1) ? task_cpu(p) : cpu; + if (target != -1) + cpu = target; } + rcu_read_unlock(); - /* - * Otherwise, just let it ride on the affined RQ and the - * post-schedule router will push the preempted task away - */ - return task_cpu(p); + return cpu; } static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c index f607de42e6f..6f437632afa 100644 --- a/kernel/sched_stoptask.c +++ b/kernel/sched_stoptask.c @@ -9,8 +9,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct rq *rq, struct task_struct *p, - int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } -- cgit v1.2.3-70-g09d2 From 74f8e4b2335de45485b8d5b31a504747f13c8070 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:47 +0200 Subject: sched: Remove rq argument to sched_class::task_waking() In preparation of calling this without rq->lock held, remove the dependency on the rq argument. 
Reviewed-by: Frank Rowand Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110405152729.071474242@chello.nl Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +++++++--- kernel/sched.c | 2 +- kernel/sched_fair.c | 4 +++- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff4e2f9c24a..7f5732f8c61 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1048,8 +1048,12 @@ struct sched_domain; #define WF_FORK 0x02 /* child wakeup after fork */ #define ENQUEUE_WAKEUP 1 -#define ENQUEUE_WAKING 2 -#define ENQUEUE_HEAD 4 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif #define DEQUEUE_SLEEP 1 @@ -1071,7 +1075,7 @@ struct sched_class { void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct rq *this_rq, struct task_struct *task); + void (*task_waking) (struct task_struct *task); void (*task_woken) (struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, diff --git a/kernel/sched.c b/kernel/sched.c index d4b815d345b..46f42cac4eb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2531,7 +2531,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, p->state = TASK_WAKING; if (p->sched_class->task_waking) { - p->sched_class->task_waking(rq, p); + p->sched_class->task_waking(p); en_flags |= ENQUEUE_WAKING; } diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 96b2c95ac35..ad4c414f456 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1372,11 +1372,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static void task_waking_fair(struct rq *rq, struct task_struct *p) +static 
void task_waking_fair(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + lockdep_assert_held(&task_rq(p)->lock); + se->vruntime -= cfs_rq->min_vruntime; } -- cgit v1.2.3-70-g09d2 From a8e4f2eaecc9bfa4954adf79a04f4f22fddd829c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:49 +0200 Subject: sched: Delay task_contributes_to_load() In preparation of having to call task_contributes_to_load() without holding rq->lock, we need to store the result until we do and can update the rq accounting accordingly. Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.151523907@chello.nl --- include/linux/sched.h | 1 + kernel/sched.c | 16 ++++------------ 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7f5732f8c61..25c50317ddc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1273,6 +1273,7 @@ struct task_struct { /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; pid_t pid; pid_t tgid; diff --git a/kernel/sched.c b/kernel/sched.c index 7a5eb262078..fd32b78c123 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2519,18 +2519,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, if (unlikely(task_running(rq, p))) goto out_activate; - /* - * In order to handle concurrent wakeups and release the rq->lock - * we put the task in TASK_WAKING state. 
- * - * First fix up the nr_uninterruptible count: - */ - if (task_contributes_to_load(p)) { - if (likely(cpu_online(orig_cpu))) - rq->nr_uninterruptible--; - else - this_rq()->nr_uninterruptible--; - } + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; if (p->sched_class->task_waking) { @@ -2555,6 +2544,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, WARN_ON(task_cpu(p) != cpu); WARN_ON(p->state != TASK_WAKING); + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + out_activate: #endif /* CONFIG_SMP */ ttwu_activate(rq, p, en_flags); -- cgit v1.2.3-70-g09d2 From 317f394160e9beb97d19a84c39b7e5eb3d7815a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 Apr 2011 17:23:58 +0200 Subject: sched: Move the second half of ttwu() to the remote cpu Now that we've removed the rq->lock requirement from the first part of ttwu() and can compute placement without holding any rq->lock, ensure we execute the second half of ttwu() on the actual cpu we want the task to run on. This avoids having to take rq->lock and doing the task enqueue remotely, saving lots on cacheline transfers. 
As measured using: http://oss.oracle.com/~mason/sembench.c $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor ; do echo performance > $i; done $ echo 4096 32000 64 128 > /proc/sys/kernel/sem $ ./sembench -t 2048 -w 1900 -o 0 unpatched: run time 30 seconds 647278 worker burns per second patched: run time 30 seconds 816715 worker burns per second Reviewed-by: Frank Rowand Cc: Mike Galbraith Cc: Nick Piggin Cc: Linus Torvalds Cc: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110405152729.515897185@chello.nl --- include/linux/sched.h | 3 ++- init/Kconfig | 5 +++++ kernel/sched.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched_features.h | 6 ++++++ 4 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 25c50317ddc..e09dafa6e14 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1203,6 +1203,7 @@ struct task_struct { int lock_depth; /* BKL lock depth */ #ifdef CONFIG_SMP + struct task_struct *wake_entry; int on_cpu; #endif int on_rq; @@ -2192,7 +2193,7 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP -static inline void scheduler_ipi(void) { } +void scheduler_ipi(void); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else static inline void scheduler_ipi(void) { } diff --git a/init/Kconfig b/init/Kconfig index 56240e724d9..32745bfe059 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -827,6 +827,11 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. 
+config SCHED_TTWU_QUEUE + bool + depends on !SPARC32 + default y + config MM_OWNER bool diff --git a/kernel/sched.c b/kernel/sched.c index 7d8b85fcdf0..9e3ede120e8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -556,6 +556,10 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif + +#ifdef CONFIG_SMP + struct task_struct *wake_list; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -2516,10 +2520,61 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) return ret; } +#ifdef CONFIG_SMP +static void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct task_struct *list = xchg(&rq->wake_list, NULL); + + if (!list) + return; + + raw_spin_lock(&rq->lock); + + while (list) { + struct task_struct *p = list; + list = list->wake_entry; + ttwu_do_activate(rq, p, 0); + } + + raw_spin_unlock(&rq->lock); +} + +void scheduler_ipi(void) +{ + sched_ttwu_pending(); +} + +static void ttwu_queue_remote(struct task_struct *p, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct task_struct *next = rq->wake_list; + + for (;;) { + struct task_struct *old = next; + + p->wake_entry = next; + next = cmpxchg(&rq->wake_list, old, p); + if (next == old) + break; + } + + if (!next) + smp_send_reschedule(cpu); +} +#endif + static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); +#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) + if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { + ttwu_queue_remote(p, cpu); + return; + } +#endif + raw_spin_lock(&rq->lock); ttwu_do_activate(rq, p, 0); raw_spin_unlock(&rq->lock); @@ -6331,6 +6386,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) #ifdef CONFIG_HOTPLUG_CPU case CPU_DYING: + sched_ttwu_pending(); /* Update our root-domain */ raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69acc29b..be40f7371ee 100644 --- 
a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1) * Decrement CPU power based on irq activity */ SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) -- cgit v1.2.3-70-g09d2 From 625f2a378e5a10f45fdc37932fc9f8a21676de9e Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Fri, 22 Apr 2011 11:19:10 -0600 Subject: sched: Get rid of lock_depth Neil Brown pointed out that lock_depth somehow escaped the BKL removal work. Let's get rid of it now. Note that the perf scripting utilities still have a bunch of code for dealing with common_lock_depth in tracepoints; I have left that in place in case anybody wants to use that code with older kernels. Suggested-by: Neil Brown Signed-off-by: Jonathan Corbet Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/20110422111910.456c0e84@bike.lwn.net Signed-off-by: Ingo Molnar --- Documentation/trace/kprobetrace.txt | 1 - include/linux/init_task.h | 1 - include/linux/sched.h | 6 ------ kernel/fork.c | 1 - kernel/mutex.c | 7 ------- kernel/sched.c | 11 +---------- kernel/sched_debug.c | 4 ---- kernel/trace/trace_kprobe.c | 1 - tools/perf/Documentation/perf-script-perl.txt | 1 - tools/perf/Documentation/perf-script-python.txt | 1 - 10 files changed, 1 insertion(+), 33 deletions(-) (limited to 'include') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index 6d27ab8d6e9..c83bd6b4e6e 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -120,7 +120,6 @@ format: field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; - field:int common_lock_depth; offset:8; size:4; signed:1; 
field:unsigned long __probe_ip; offset:12; size:4; signed:0; field:int __probe_nargs; offset:16; size:4; signed:1; diff --git a/include/linux/init_task.h b/include/linux/init_task.h index caa151fbebb..689496bb665 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -134,7 +134,6 @@ extern struct cred init_cred; .stack = &init_thread_info, \ .usage = ATOMIC_INIT(2), \ .flags = PF_KTHREAD, \ - .lock_depth = -1, \ .prio = MAX_PRIO-20, \ .static_prio = MAX_PRIO-20, \ .normal_prio = MAX_PRIO-20, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 171ba24b08a..013314a5610 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -731,10 +731,6 @@ struct sched_info { /* timestamps */ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ -#ifdef CONFIG_SCHEDSTATS - /* BKL stats */ - unsigned int bkl_count; -#endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -1190,8 +1186,6 @@ struct task_struct { unsigned int flags; /* per process flags, defined below */ unsigned int ptrace; - int lock_depth; /* BKL lock depth */ - #ifdef CONFIG_SMP struct task_struct *wake_entry; int on_cpu; diff --git a/kernel/fork.c b/kernel/fork.c index e7548dee636..aca62871a4f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1103,7 +1103,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, posix_cpu_timers_init(p); - p->lock_depth = -1; /* -1 = no lock */ do_posix_clock_monotonic_gettime(&p->start_time); p->real_start_time = p->start_time; monotonic_to_bootbased(&p->real_start_time); diff --git a/kernel/mutex.c b/kernel/mutex.c index fe4706cb0c5..2c938e2337c 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -162,13 +162,6 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, for (;;) { struct task_struct *owner; - /* - * If we own the BKL, then don't spin. 
The owner of - * the mutex might be waiting on us to release the BKL. - */ - if (unlikely(current->lock_depth >= 0)) - break; - /* * If there's an owner, wait for it to either * release the lock or go to sleep. diff --git a/kernel/sched.c b/kernel/sched.c index 8cb0a5769a1..9cde2dd229c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4121,12 +4121,6 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); schedstat_inc(this_rq(), sched_count); -#ifdef CONFIG_SCHEDSTATS - if (unlikely(prev->lock_depth >= 0)) { - schedstat_inc(this_rq(), rq_sched_info.bkl_count); - schedstat_inc(prev, sched_info.bkl_count); - } -#endif } static void put_prev_task(struct rq *rq, struct task_struct *prev) @@ -5852,11 +5846,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#if defined(CONFIG_PREEMPT) - task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); -#else task_thread_info(idle)->preempt_count = 0; -#endif + /* * The idle tasks have their own, simple scheduling class: */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 3669bec6e13..a6710a112b4 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(ttwu_count); P(ttwu_local); - SEQ_printf(m, " .%-30s: %d\n", "bkl_count", - rq->rq_sched_info.bkl_count); - #undef P #undef P64 #endif @@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.statistics.wait_count); PN(se.statistics.iowait_sum); P(se.statistics.iowait_count); - P(sched_info.bkl_count); P(se.nr_migrations); P(se.statistics.nr_migrations_cold); P(se.statistics.nr_failed_migrations_affine); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 35d55a38614..f925c45f0af 100644 --- a/kernel/trace/trace_kprobe.c +++ 
b/kernel/trace/trace_kprobe.c @@ -53,7 +53,6 @@ const char *reserved_field_names[] = { "common_preempt_count", "common_pid", "common_tgid", - "common_lock_depth", FIELD_STRING_IP, FIELD_STRING_RETIP, FIELD_STRING_FUNC, diff --git a/tools/perf/Documentation/perf-script-perl.txt b/tools/perf/Documentation/perf-script-perl.txt index 5bb41e55a3a..3152cca1550 100644 --- a/tools/perf/Documentation/perf-script-perl.txt +++ b/tools/perf/Documentation/perf-script-perl.txt @@ -63,7 +63,6 @@ The format file for the sched_wakep event defines the following fields field:unsigned char common_flags; field:unsigned char common_preempt_count; field:int common_pid; - field:int common_lock_depth; field:char comm[TASK_COMM_LEN]; field:pid_t pid; diff --git a/tools/perf/Documentation/perf-script-python.txt b/tools/perf/Documentation/perf-script-python.txt index 36b38277422..47102206911 100644 --- a/tools/perf/Documentation/perf-script-python.txt +++ b/tools/perf/Documentation/perf-script-python.txt @@ -463,7 +463,6 @@ The format file for the sched_wakep event defines the following fields field:unsigned char common_flags; field:unsigned char common_preempt_count; field:int common_pid; - field:int common_lock_depth; field:char comm[TASK_COMM_LEN]; field:pid_t pid; -- cgit v1.2.3-70-g09d2 From 3e51e3edfd81bfd9853ad7de91167e4ce33d0fe7 Mon Sep 17 00:00:00 2001 From: Samir Bellabes Date: Wed, 11 May 2011 18:18:05 +0200 Subject: sched: Remove unused parameters from sched_fork() and wake_up_new_task() sched_fork() and wake_up_new_task() each take a 'clone_flags' parameter ('int' and 'unsigned long' respectively) that is never used. This patch removes both parameters. 
Signed-off-by: Samir Bellabes Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1305130685-1047-1-git-send-email-sam@synack.fr Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 ++--- kernel/fork.c | 4 ++-- kernel/sched.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6b4280b23ee..12211e1666e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2051,14 +2051,13 @@ extern void xtime_update(unsigned long ticks); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); -extern void wake_up_new_task(struct task_struct *tsk, - unsigned long clone_flags); +extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP extern void kick_process(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p, int clone_flags); +extern void sched_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index aca62871a4f..2b44d82b823 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1152,7 +1152,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ - sched_fork(p, clone_flags); + sched_fork(p); retval = perf_event_init_task(p); if (retval) @@ -1463,7 +1463,7 @@ long do_fork(unsigned long clone_flags, */ p->flags &= ~PF_STARTING; - wake_up_new_task(p, clone_flags); + wake_up_new_task(p); tracehook_report_clone_complete(trace, regs, clone_flags, nr, p); diff --git a/kernel/sched.c b/kernel/sched.c index da933815048..f9778c0d91e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2741,7 +2741,7 @@ static void __sched_fork(struct task_struct *p) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p, int clone_flags) +void sched_fork(struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); @@ -2823,7 +2823,7 @@ void sched_fork(struct task_struct *p, int clone_flags) * that must be done for every newly created context, then puts the task * on the runqueue and wakes it. */ -void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) +void wake_up_new_task(struct task_struct *p) { unsigned long flags; struct rq *rq; -- cgit v1.2.3-70-g09d2