diff options
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/hardirq.h | 6 | ||||
-rw-r--r-- | include/linux/kernel.h | 5 | ||||
-rw-r--r-- | include/linux/sched.h | 94 | ||||
-rw-r--r-- | include/linux/topology.h | 168 | ||||
-rw-r--r-- | include/trace/events/sched.h | 95 |
5 files changed, 260 insertions, 108 deletions
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 330cb31bb49..6d527ee82b2 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -64,6 +64,12 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#ifndef PREEMPT_ACTIVE +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +#endif + #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3e8de..2b5b1e0899a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -125,7 +125,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line); + void __might_sleep(char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -137,8 +137,9 @@ extern int _cond_resched(void); * supposed to. */ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else + static inline void __might_sleep(char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 379531c0897..f3d74bd04d1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 #ifdef __KERNEL__ @@ -796,18 +798,19 @@ enum cpu_idle_type { #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 4 /* Balance on exec */ -#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ -#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ -#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ -#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ -#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ +#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ +#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ +#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ +#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ +#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -827,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) if (sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return 0; + return SD_PREFER_SIBLING; } static inline int sd_balance_for_package_power(void) @@ -835,7 +838,7 @@ static inline int sd_balance_for_package_power(void) if (sched_mc_power_savings | sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return 0; + return SD_PREFER_SIBLING; } /* @@ -857,15 +860,9 @@ struct sched_group { /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This is read only (except for setup, hotplug CPU). - * Note : Never change cpu_power without recompute its reciprocal - */ - unsigned int __cpu_power; - /* - * reciprocal value of cpu_power to avoid expensive divides - * (see include/linux/reciprocal_div.h) + * single CPU. */ - u32 reciprocal_cpu_power; + unsigned int cpu_power; /* * The CPUs this group covers. @@ -918,6 +915,7 @@ struct sched_domain { unsigned int newidle_idx; unsigned int wake_idx; unsigned int forkexec_idx; + unsigned int smt_gain; int flags; /* See SD_* */ enum sched_domain_level level; @@ -1045,7 +1043,6 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); - int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); @@ -1110,6 +1107,8 @@ struct sched_entity { u64 wait_max; u64 wait_count; u64 wait_sum; + u64 iowait_count; + u64 iowait_sum; u64 sleep_start; u64 sleep_max; @@ -1234,11 +1233,19 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + unsigned in_iowait:1; + + + /* Revert to default priority/policy when forking */ + unsigned sched_reset_on_fork:1; + pid_t pid; pid_t tgid; +#ifdef CONFIG_CC_STACKPROTECTOR /* Canary value for the -fstack-protector gcc feature */ unsigned long stack_canary; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, @@ -1840,11 +1847,12 @@ extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_thresh; -#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, @@ -2308,23 +2316,31 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. */ extern int _cond_resched(void); -#ifdef CONFIG_PREEMPT_BKL -static inline int cond_resched(void) -{ - return 0; -} + +#define cond_resched() ({ \ + __might_sleep(__FILE__, __LINE__, 0); \ + _cond_resched(); \ +}) + +extern int __cond_resched_lock(spinlock_t *lock); + +#ifdef CONFIG_PREEMPT +#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else -static inline int cond_resched(void) -{ - return _cond_resched(); -} +#define PREEMPT_LOCK_OFFSET 0 #endif -extern int cond_resched_lock(spinlock_t * lock); -extern int cond_resched_softirq(void); -static inline int cond_resched_bkl(void) -{ - return _cond_resched(); -} + +#define cond_resched_lock(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + __cond_resched_lock(lock); \ +}) + +extern int __cond_resched_softirq(void); + +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ + __cond_resched_softirq(); \ +}) /* * Does a critical section need to be broken due to another diff --git a/include/linux/topology.h b/include/linux/topology.h index 7402c1a27c4..85e8cf7d393 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -85,20 +85,29 @@ int arch_update_cpu_topology(void); #define ARCH_HAS_SCHED_WAKE_IDLE /* Common values for SMT siblings */ #ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 64, \ + .imbalance_pct = 110, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 1*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .smt_gain = 1178, /* 15% */ \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -106,69 +115,94 @@ int arch_update_cpu_topology(void); #ifdef CONFIG_SCHED_MC /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ #ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SHARE_PKG_RESOURCES\ - | sd_balance_for_mc_power()\ - | sd_power_saving_flags(),\ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_MC_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 1*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 1*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | sd_balance_for_mc_power() \ + | sd_power_saving_flags() \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #endif #endif /* CONFIG_SCHED_MC */ /* Common values for CPUs */ #ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | sd_balance_for_package_power()\ - | sd_power_saving_flags(),\ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_CPU_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 1*SD_WAKE_IDLE \ + | 0*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | sd_balance_for_package_power() \ + | sd_power_saving_flags() \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #endif /* sched_domains SD_ALLNODES_INIT for NUMA machines */ -#define SD_ALLNODES_INIT (struct sched_domain) { \ - .min_interval = 64, \ - .max_interval = 64*num_online_cpus(), \ - .busy_factor = 128, \ - .imbalance_pct = 133, \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 3, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_WAKE_AFFINE \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 64, \ +#define SD_ALLNODES_INIT (struct sched_domain) { \ + .min_interval = 64, \ + .max_interval = 64*num_online_cpus(), \ + .busy_factor = 128, \ + .imbalance_pct = 133, \ + .cache_nice_tries = 1, \ + .busy_idx = 3, \ + .idle_idx = 3, \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 0*SD_BALANCE_EXEC \ + | 0*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 0*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SERIALIZE \ + | 1*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 64, \ } #ifdef CONFIG_NUMA diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8949bb7eb08..a4c369ec328 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, __entry->sig, __entry->comm, __entry->pid) ); +/* + * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE + * adding sched_stat support to SCHED_FIFO/RR would be welcome. + */ + +/* + * Tracepoint for accounting wait time (time the task is runnable + * but not actually running due to scheduler contention). + */ +TRACE_EVENT(sched_stat_wait, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d wait: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + +/* + * Tracepoint for accounting sleep time (time the task is not runnable, + * including iowait, see below). + */ +TRACE_EVENT(sched_stat_sleep, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d sleep: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + +/* + * Tracepoint for accounting iowait time (time the task is not runnable + * due to waiting on IO to complete). + */ +TRACE_EVENT(sched_stat_iowait, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d iowait: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ |