diff options
Diffstat (limited to 'include/linux/sched.h')
-rw-r--r-- | include/linux/sched.h | 378 |
1 files changed, 332 insertions, 46 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b29..a781dec1cd0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -16,12 +16,14 @@ struct sched_param { #include <linux/types.h> #include <linux/timex.h> #include <linux/jiffies.h> +#include <linux/plist.h> #include <linux/rbtree.h> #include <linux/thread_info.h> #include <linux/cpumask.h> #include <linux/errno.h> #include <linux/nodemask.h> #include <linux/mm_types.h> +#include <linux/preempt_mask.h> #include <asm/page.h> #include <asm/ptrace.h> @@ -55,6 +57,70 @@ struct sched_param { #include <asm/processor.h> +#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + +/* + * Extended scheduling parameters data structure. + * + * This is needed because the original struct sched_param can not be + * altered without introducing ABI issues with legacy applications + * (e.g., in sched_getparam()). + * + * However, the possibility of specifying more than just a priority for + * the tasks may be useful for a wide variety of application fields, e.g., + * multimedia, streaming, automation and control, and many others. + * + * This variant (sched_attr) is meant at describing a so-called + * sporadic time-constrained task. In such model a task is specified by: + * - the activation period or minimum instance inter-arrival time; + * - the maximum (or average, depending on the actual scheduling + * discipline) computation time of all instances, a.k.a. runtime; + * - the deadline (relative to the actual activation time) of each + * instance. + * Very briefly, a periodic (sporadic) task asks for the execution of + * some specific computation --which is typically called an instance-- + * (at most) every period. Moreover, each instance typically lasts no more + * than the runtime and must be completed by time instant t equal to + * the instance activation time + the deadline. + * + * This is reflected by the actual fields of the sched_attr structure: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_flags for customizing the scheduler behaviour + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * @sched_deadline representative of the task's deadline + * @sched_runtime representative of the task's runtime + * @sched_period representative of the task's period + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their + * timing constraints. + * + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the + * only user of this new interface. More information about the algorithm + * available in the scheduling class file or in Documentation/. + */ +struct sched_attr { + u32 size; + + u32 sched_policy; + u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + u32 sched_priority; + + /* SCHED_DEADLINE */ + u64 sched_runtime; + u64 sched_deadline; + u64 sched_period; +}; + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -62,6 +128,7 @@ struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; +struct filename; /* * List of flags we want to share for kernel threads, @@ -163,11 +230,10 @@ extern char ___assert_task_state[1 - 2*!!( /* get_task_state() */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED) + __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_dead(task) ((task)->exit_state != 0) #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ @@ -285,6 +351,14 @@ static inline void lockup_detector_init(void) } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +void reset_hung_task_detector(void); +#else +static inline void reset_hung_task_detector(void) +{ +} +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -318,18 +392,33 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif - -extern void set_dumpable(struct mm_struct *mm, int value); -extern int get_dumpable(struct mm_struct *mm); +#define SUID_DUMP_DISABLE 0 /* No setuid dumping */ +#define SUID_DUMP_USER 1 /* Dump as user of process */ +#define SUID_DUMP_ROOT 2 /* Dump as root */ /* mm flags */ -/* dumpable bits */ -#define MMF_DUMPABLE 0 /* core dump is permitted */ -#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ +/* for SUID_DUMP_* above */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +extern void set_dumpable(struct mm_struct *mm, int value); +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ +static inline int __get_dumpable(unsigned long mm_flags) +{ + return mm_flags & MMF_DUMPABLE_MASK; +} + +static inline int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -427,6 +516,12 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +#ifdef CONFIG_PREEMPT_COUNT +#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED) +#else +#define PREEMPT_DISABLED PREEMPT_ENABLED +#endif + /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). @@ -434,7 +529,7 @@ struct task_cputime { * We include PREEMPT_ACTIVE to avoid cond_resched() from working * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) +#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts @@ -466,6 +561,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ @@ -768,6 +864,7 @@ enum cpu_idle_type { #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x4000 /* cross-node balancing */ extern int __weak arch_sd_sibiling_asym_packing(void); @@ -809,7 +906,9 @@ struct sched_domain { unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ - u64 last_update; + /* idle_balance() stats */ + u64 max_newidle_lb_cost; + unsigned long next_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ @@ -908,7 +1007,8 @@ struct pipe_inode_info; struct uts_namespace; struct load_weight { - unsigned long weight, inv_weight; + unsigned long weight; + u32 inv_weight; }; struct sched_avg { @@ -1006,6 +1106,51 @@ struct sched_rt_entity { #endif }; +struct sched_dl_entity { + struct rb_node rb_node; + + /* + * Original scheduling parameters. Copied here from sched_attr + * during sched_setscheduler2(), they will remain the same until + * the next sched_setscheduler2(). + */ + u64 dl_runtime; /* maximum runtime for each instance */ + u64 dl_deadline; /* relative deadline of each instance */ + u64 dl_period; /* separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_deadline */ + + /* + * Actual scheduling parameters. Initialized with the values above, + * they are continously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. + */ + s64 runtime; /* remaining runtime for this instance */ + u64 deadline; /* absolute deadline for this instance */ + unsigned int flags; /* specifying the scheduler behaviour */ + + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_new tells if a new instance arrived. If so we must + * start executing it with full runtime and reset its absolute + * deadline; + * + * @dl_boosted tells if we are boosted due to DI. If so we are + * outside bandwidth enforcement mechanism (but only until we + * exit the critical section). + */ + int dl_throttled, dl_new, dl_boosted; + + /* + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. + */ + struct hrtimer dl_timer; +}; struct rcu_node; @@ -1029,6 +1174,8 @@ struct task_struct { struct task_struct *last_wakee; unsigned long wakee_flips; unsigned long wakee_flip_decay_ts; + + int wake_cpu; #endif int on_rq; @@ -1040,21 +1187,13 @@ struct task_struct { #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif + struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif - /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time - */ - unsigned char fpu_counter; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif @@ -1082,6 +1221,7 @@ struct task_struct { struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm, *active_mm; @@ -1100,7 +1240,6 @@ struct task_struct { /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; - unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ unsigned in_iowait:1; @@ -1144,6 +1283,7 @@ struct task_struct { /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; struct list_head thread_group; + struct list_head thread_node; struct completion *vfork_done; /* for vfork() */ int __user *set_child_tid; /* CLONE_CHILD_SETTID */ @@ -1233,9 +1373,12 @@ struct task_struct { #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ - struct plist_head pi_waiters; + struct rb_root pi_waiters; + struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; + /* Top pi_waiters task */ + struct task_struct *pi_top_task; #endif #ifdef CONFIG_DEBUG_MUTEXES @@ -1324,10 +1467,41 @@ struct task_struct { #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; - int numa_migrate_seq; unsigned int numa_scan_period; + unsigned int numa_scan_period_max; + int numa_preferred_nid; + int numa_migrate_deferred; + unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ struct callback_head numa_work; + + struct list_head numa_entry; + struct numa_group *numa_group; + + /* + * Exponential decaying average of faults on a per-node basis. + * Scheduling placement decisions are made based on the these counts. + * The values remain static for the duration of a PTE scan + */ + unsigned long *numa_faults; + unsigned long total_numa_faults; + + /* + * numa_faults_buffer records faults per node during the current + * scan window. When the scan completes, the counts in numa_faults + * decay and these values are copied. + */ + unsigned long *numa_faults_buffer; + + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local. The task scan period is adapted + * based on the locality of the faults with different weights + * depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[2]; + + unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ struct rcu_head rcu; @@ -1394,11 +1568,10 @@ struct task_struct { } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { + struct mem_cgroup *memcg; + gfp_t gfp_mask; + int order; unsigned int may_oom:1; - unsigned int in_memcg_oom:1; - unsigned int oom_locked:1; - int wakeups; - struct mem_cgroup *wait_on_memcg; } memcg_oom; #endif #ifdef CONFIG_UPROBES @@ -1413,16 +1586,33 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 + #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, int flags); +extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); +extern void task_numa_free(struct task_struct *p); + +extern unsigned int sysctl_numa_balancing_migrate_deferred; #else -static inline void task_numa_fault(int node, int pages, bool migrated) +static inline void task_numa_fault(int last_node, int node, int pages, + int flags) { } +static inline pid_t task_numa_group_id(struct task_struct *p) +{ + return 0; +} static inline void set_numabalancing_state(bool enabled) { } +static inline void task_numa_free(struct task_struct *p) +{ +} #endif static inline struct pid *task_pid(struct task_struct *task) @@ -1817,7 +2007,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns) * but then during bootup it turns out that sched_clock() * is reliable after all: */ -extern int sched_clock_stable; +extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); +extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); @@ -1896,6 +2088,8 @@ extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_setattr(struct task_struct *, + const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? @@ -1975,7 +2169,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p); +extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); @@ -2101,8 +2295,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task); extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); -/* Allocate a new mm structure and copy contents from tsk->mm */ -extern struct mm_struct *dup_mm(struct task_struct *tsk); extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); @@ -2120,7 +2312,7 @@ extern void do_group_exit(int); extern int allow_signal(int); extern int disallow_signal(int); -extern int do_execve(const char *, +extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); @@ -2160,6 +2352,16 @@ extern bool current_is_single_threaded(void); #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) +#define __for_each_thread(signal, t) \ + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + +#define for_each_thread(p, t) \ + __for_each_thread((p)->signal, t) + +/* Careful: this is a double loop, 'break' won't work as expected. */ +#define for_each_process_thread(p, t) \ + for_each_process(p) for_each_thread(p, t) + static inline int get_nr_threads(struct task_struct *tsk) { return tsk->signal->nr_threads; @@ -2402,11 +2604,6 @@ static inline int signal_pending_state(long state, struct task_struct *p) return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -2475,36 +2672,120 @@ static inline int tsk_is_polling(struct task_struct *p) { return task_thread_info(p)->status & TS_POLLING; } -static inline void current_set_polling(void) +static inline void __current_set_polling(void) { current_thread_info()->status |= TS_POLLING; } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { current_thread_info()->status &= ~TS_POLLING; - smp_mb__after_clear_bit(); +} + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb(); + + return unlikely(tif_need_resched()); } #elif defined(TIF_POLLING_NRFLAG) static inline int tsk_is_polling(struct task_struct *p) { return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); } -static inline void current_set_polling(void) + +static inline void __current_set_polling(void) { set_thread_flag(TIF_POLLING_NRFLAG); } -static inline void current_clr_polling(void) +static inline bool __must_check current_set_polling_and_test(void) +{ + __current_set_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + * + * XXX: assumes set/clear bit are identical barrier wise. + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + +static inline void __current_clr_polling(void) { clear_thread_flag(TIF_POLLING_NRFLAG); } + +static inline bool __must_check current_clr_polling_and_test(void) +{ + __current_clr_polling(); + + /* + * Polling state must be visible before we test NEED_RESCHED, + * paired by resched_task() + */ + smp_mb__after_clear_bit(); + + return unlikely(tif_need_resched()); +} + #else static inline int tsk_is_polling(struct task_struct *p) { return 0; } -static inline void current_set_polling(void) { } -static inline void current_clr_polling(void) { } +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } + +static inline bool __must_check current_set_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} +static inline bool __must_check current_clr_polling_and_test(void) +{ + return unlikely(tif_need_resched()); +} #endif +static inline void current_clr_polling(void) +{ + __current_clr_polling(); + + /* + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. + */ + smp_mb(); /* paired with resched_task() */ + + preempt_fold_need_resched(); +} + +static __always_inline bool need_resched(void) +{ + return unlikely(tif_need_resched()); +} + /* * Thread group CPU time accounting. */ @@ -2546,6 +2827,11 @@ static inline unsigned int task_cpu(const struct task_struct *p) return task_thread_info(p)->cpu; } +static inline int task_node(const struct task_struct *p) +{ + return cpu_to_node(task_cpu(p)); +} + extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else |