Diffstat (limited to 'include/linux/sched.h')
-rw-r--r--  include/linux/sched.h  378
1 file changed, 332 insertions(+), 46 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6682da36b29..a781dec1cd0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -16,12 +16,14 @@ struct sched_param {
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
+#include <linux/plist.h>
#include <linux/rbtree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/nodemask.h>
#include <linux/mm_types.h>
+#include <linux/preempt_mask.h>
#include <asm/page.h>
#include <asm/ptrace.h>
@@ -55,6 +57,70 @@ struct sched_param {
#include <asm/processor.h>
+#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
+
+/*
+ * Extended scheduling parameters data structure.
+ *
+ * This is needed because the original struct sched_param can not be
+ * altered without introducing ABI issues with legacy applications
+ * (e.g., in sched_getparam()).
+ *
+ * However, the possibility of specifying more than just a priority for
+ * the tasks may be useful for a wide variety of application fields, e.g.,
+ * multimedia, streaming, automation and control, and many others.
+ *
+ * This variant (sched_attr) is meant to describe a so-called
+ * sporadic time-constrained task. In such a model a task is specified by:
+ * - the activation period or minimum instance inter-arrival time;
+ * - the maximum (or average, depending on the actual scheduling
+ * discipline) computation time of all instances, a.k.a. runtime;
+ * - the deadline (relative to the actual activation time) of each
+ * instance.
+ * Very briefly, a periodic (sporadic) task asks for the execution of
+ * some specific computation --which is typically called an instance--
+ * (at most) every period. Moreover, each instance typically lasts no more
+ * than the runtime and must be completed by time instant t equal to
+ * the instance activation time + the deadline.
+ *
+ * This is reflected by the actual fields of the sched_attr structure:
+ *
+ * @size size of the structure, for fwd/bwd compat.
+ *
+ * @sched_policy task's scheduling policy
+ * @sched_flags for customizing the scheduler behaviour
+ * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
+ * @sched_priority task's static priority (SCHED_FIFO/RR)
+ * @sched_deadline representative of the task's deadline
+ * @sched_runtime representative of the task's runtime
+ * @sched_period representative of the task's period
+ *
+ * Given this task model, there are many scheduling algorithms and
+ * policies that can be used to ensure all the tasks will meet their
+ * timing constraints.
+ *
+ * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
+ * only user of this new interface. More information about the algorithm is
+ * available in the scheduling class file or in Documentation/.
+ */
+struct sched_attr {
+ u32 size;
+
+ u32 sched_policy;
+ u64 sched_flags;
+
+ /* SCHED_NORMAL, SCHED_BATCH */
+ s32 sched_nice;
+
+ /* SCHED_FIFO, SCHED_RR */
+ u32 sched_priority;
+
+ /* SCHED_DEADLINE */
+ u64 sched_runtime;
+ u64 sched_deadline;
+ u64 sched_period;
+};
+
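[Not part of the patch: a minimal userspace sketch of driving this interface. It assumes the new syscall is reachable as __NR_sched_setattr with no libc wrapper and that SCHED_DEADLINE carries the value used elsewhere in this series; the struct simply mirrors the kernel definition above.]

#define _GNU_SOURCE
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_user {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* nanoseconds */
	uint64_t sched_deadline;	/* nanoseconds */
	uint64_t sched_period;		/* nanoseconds */
};

/* ask for 10ms of runtime every 30ms, due 30ms after each activation */
static int become_deadline_task(void)
{
	struct sched_attr_user attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);			/* SCHED_ATTR_SIZE_VER0 */
	attr.sched_policy = 6;				/* SCHED_DEADLINE, assumed value */
	attr.sched_runtime  = 10ULL * 1000 * 1000;
	attr.sched_deadline = 30ULL * 1000 * 1000;
	attr.sched_period   = 30ULL * 1000 * 1000;

	return syscall(__NR_sched_setattr, 0, &attr, 0);	/* pid 0: calling thread */
}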
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
@@ -62,6 +128,7 @@ struct bio_list;
struct fs_struct;
struct perf_event_context;
struct blk_plug;
+struct filename;
/*
* List of flags we want to share for kernel threads,
@@ -163,11 +230,10 @@ extern char ___assert_task_state[1 - 2*!!(
/* get_task_state() */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
- __TASK_TRACED)
+ __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
-#define task_is_dead(task) ((task)->exit_state != 0)
#define task_is_stopped_or_traced(task) \
((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) \
@@ -285,6 +351,14 @@ static inline void lockup_detector_init(void)
}
#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+void reset_hung_task_detector(void);
+#else
+static inline void reset_hung_task_detector(void)
+{
+}
+#endif
+
/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
@@ -318,18 +392,33 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
-
-extern void set_dumpable(struct mm_struct *mm, int value);
-extern int get_dumpable(struct mm_struct *mm);
+#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
+#define SUID_DUMP_USER 1 /* Dump as user of process */
+#define SUID_DUMP_ROOT 2 /* Dump as root */
/* mm flags */
-/* dumpable bits */
-#define MMF_DUMPABLE 0 /* core dump is permitted */
-#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
+/* for SUID_DUMP_* above */
#define MMF_DUMPABLE_BITS 2
#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
+extern void set_dumpable(struct mm_struct *mm, int value);
+/*
+ * This returns the actual value of the suid_dumpable flag. Callers that
+ * use it to check for privilege transitions must test against
+ * SUID_DUMP_USER rather than treating it as a boolean value.
+ */
+static inline int __get_dumpable(unsigned long mm_flags)
+{
+ return mm_flags & MMF_DUMPABLE_MASK;
+}
+
+static inline int get_dumpable(struct mm_struct *mm)
+{
+ return __get_dumpable(mm->flags);
+}
+
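[Not part of the patch: a small illustration of the point the comment above makes. With three SUID_DUMP_* values, a privilege-transition check must compare against SUID_DUMP_USER explicitly; a plain boolean test would also accept SUID_DUMP_ROOT. The helper name is hypothetical.]

static inline bool mm_is_user_dumpable(struct mm_struct *mm)
{
	/* "if (get_dumpable(mm))" would also be true for SUID_DUMP_ROOT */
	return get_dumpable(mm) == SUID_DUMP_USER;
}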
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
@@ -427,6 +516,12 @@ struct task_cputime {
.sum_exec_runtime = 0, \
}
+#ifdef CONFIG_PREEMPT_COUNT
+#define PREEMPT_DISABLED (1 + PREEMPT_ENABLED)
+#else
+#define PREEMPT_DISABLED PREEMPT_ENABLED
+#endif
+
/*
* Disable preemption until the scheduler is running.
* Reset by start_kernel()->sched_init()->init_idle().
@@ -434,7 +529,7 @@ struct task_cputime {
* We include PREEMPT_ACTIVE to avoid cond_resched() from working
* before the scheduler is active -- see should_resched().
*/
-#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
+#define INIT_PREEMPT_COUNT (PREEMPT_DISABLED + PREEMPT_ACTIVE)
/**
* struct thread_group_cputimer - thread group interval timer counts
@@ -466,6 +561,7 @@ struct signal_struct {
atomic_t sigcnt;
atomic_t live;
int nr_threads;
+ struct list_head thread_head;
wait_queue_head_t wait_chldexit; /* for wait4() */
@@ -768,6 +864,7 @@ enum cpu_idle_type {
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
+#define SD_NUMA 0x4000 /* cross-node balancing */
extern int __weak arch_sd_sibiling_asym_packing(void);
@@ -809,7 +906,9 @@ struct sched_domain {
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
- u64 last_update;
+ /* idle_balance() stats */
+ u64 max_newidle_lb_cost;
+ unsigned long next_decay_max_lb_cost;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
@@ -908,7 +1007,8 @@ struct pipe_inode_info;
struct uts_namespace;
struct load_weight {
- unsigned long weight, inv_weight;
+ unsigned long weight;
+ u32 inv_weight;
};
struct sched_avg {
@@ -1006,6 +1106,51 @@ struct sched_rt_entity {
#endif
};
+struct sched_dl_entity {
+ struct rb_node rb_node;
+
+ /*
+ * Original scheduling parameters. Copied here from sched_attr
+ * during sched_setattr(), they will remain the same until
+ * the next sched_setattr().
+ */
+ u64 dl_runtime; /* maximum runtime for each instance */
+ u64 dl_deadline; /* relative deadline of each instance */
+ u64 dl_period; /* separation of two instances (period) */
+ u64 dl_bw; /* dl_runtime / dl_deadline */
+
+ /*
+ * Actual scheduling parameters. Initialized with the values above,
+ * they are continuously updated during task execution. Note that
+ * the remaining runtime could be < 0 in case we are in overrun.
+ */
+ s64 runtime; /* remaining runtime for this instance */
+ u64 deadline; /* absolute deadline for this instance */
+ unsigned int flags; /* specifying the scheduler behaviour */
+
+ /*
+ * Some bool flags:
+ *
+ * @dl_throttled tells if we exhausted the runtime. If so, the
+ * task has to wait for a replenishment to be performed at the
+ * next firing of dl_timer.
+ *
+ * @dl_new tells if a new instance arrived. If so we must
+ * start executing it with full runtime and reset its absolute
+ * deadline;
+ *
+ * @dl_boosted tells if we are boosted due to DI (deadline
+ * inheritance). If so we are outside the bandwidth enforcement
+ * mechanism (but only until we exit the critical section).
+ */
+ int dl_throttled, dl_new, dl_boosted;
+
+ /*
+ * Bandwidth enforcement timer. Each -deadline task has its
+ * own bandwidth to be enforced, thus we need one timer per task.
+ */
+ struct hrtimer dl_timer;
+};
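[Not part of the patch: one way to read the dl_bw field. The bandwidth ratio is cached as a fixed-point quotient so admission tests can sum per-task bandwidths without dividing on every check. The shift width below is an assumption for illustration; the scheduling class uses its own helper for this.]

#define DL_BW_SHIFT	20	/* assumed fixed-point precision */

static inline u64 dl_bw_from_params(u64 runtime_ns, u64 deadline_ns)
{
	if (!deadline_ns)
		return 0;
	/* runtime / deadline, scaled by 2^DL_BW_SHIFT */
	return (runtime_ns << DL_BW_SHIFT) / deadline_ns;
}

(A real kernel version would go through div64_u64() so the 64-bit division also works on 32-bit architectures.)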
struct rcu_node;
@@ -1029,6 +1174,8 @@ struct task_struct {
struct task_struct *last_wakee;
unsigned long wakee_flips;
unsigned long wakee_flip_decay_ts;
+
+ int wake_cpu;
#endif
int on_rq;
@@ -1040,21 +1187,13 @@ struct task_struct {
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
+ struct sched_dl_entity dl;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
- /*
- * fpu_counter contains the number of consecutive context switches
- * that the FPU is used. If this is over a threshold, the lazy fpu
- * saving becomes unlazy to save the trap. This is an unsigned char
- * so that after 256 times the counter wraps and the behavior turns
- * lazy again; this to deal with bursty apps that only use FPU for
- * a short time
- */
- unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
@@ -1082,6 +1221,7 @@ struct task_struct {
struct list_head tasks;
#ifdef CONFIG_SMP
struct plist_node pushable_tasks;
+ struct rb_node pushable_dl_tasks;
#endif
struct mm_struct *mm, *active_mm;
@@ -1100,7 +1240,6 @@ struct task_struct {
/* Used for emulating ABI behavior of previous Linux versions */
unsigned int personality;
- unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;
@@ -1144,6 +1283,7 @@ struct task_struct {
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
+ struct list_head thread_node;
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
@@ -1233,9 +1373,12 @@ struct task_struct {
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
- struct plist_head pi_waiters;
+ struct rb_root pi_waiters;
+ struct rb_node *pi_waiters_leftmost;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
+ /* Top pi_waiters task */
+ struct task_struct *pi_top_task;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
@@ -1324,10 +1467,41 @@ struct task_struct {
#endif
#ifdef CONFIG_NUMA_BALANCING
int numa_scan_seq;
- int numa_migrate_seq;
unsigned int numa_scan_period;
+ unsigned int numa_scan_period_max;
+ int numa_preferred_nid;
+ int numa_migrate_deferred;
+ unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
+
+ struct list_head numa_entry;
+ struct numa_group *numa_group;
+
+ /*
+ * Exponentially decaying average of faults on a per-node basis.
+ * Scheduling placement decisions are made based on these counts.
+ * The values remain static for the duration of a PTE scan
+ */
+ unsigned long *numa_faults;
+ unsigned long total_numa_faults;
+
+ /*
+ * numa_faults_buffer records faults per node during the current
+ * scan window. When the scan completes, the counts in numa_faults
+ * decay and these values are copied.
+ */
+ unsigned long *numa_faults_buffer;
+
+ /*
+ * numa_faults_locality tracks if faults recorded during the last
+ * scan window were remote/local. The task scan period is adapted
+ * based on the locality of the faults with different weights
+ * depending on whether they were shared or private faults
+ */
+ unsigned long numa_faults_locality[2];
+
+ unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
struct rcu_head rcu;
@@ -1394,11 +1568,10 @@ struct task_struct {
} memcg_batch;
unsigned int memcg_kmem_skip_account;
struct memcg_oom_info {
+ struct mem_cgroup *memcg;
+ gfp_t gfp_mask;
+ int order;
unsigned int may_oom:1;
- unsigned int in_memcg_oom:1;
- unsigned int oom_locked:1;
- int wakeups;
- struct mem_cgroup *wait_on_memcg;
} memcg_oom;
#endif
#ifdef CONFIG_UPROBES
@@ -1413,16 +1586,33 @@ struct task_struct {
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+#define TNF_MIGRATED 0x01
+#define TNF_NO_GROUP 0x02
+#define TNF_SHARED 0x04
+#define TNF_FAULT_LOCAL 0x08
+
#ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages, bool migrated);
+extern void task_numa_fault(int last_node, int node, int pages, int flags);
+extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
+extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
#else
-static inline void task_numa_fault(int node, int pages, bool migrated)
+static inline void task_numa_fault(int last_node, int node, int pages,
+ int flags)
{
}
+static inline pid_t task_numa_group_id(struct task_struct *p)
+{
+ return 0;
+}
static inline void set_numabalancing_state(bool enabled)
{
}
+static inline void task_numa_free(struct task_struct *p)
+{
+}
#endif
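[Not part of the patch: a sketch of how a fault path might use the flag-based interface above. last_nid, target_nid and the helper name are illustrative, not taken from this header.]

static void report_numa_hinting_fault(int last_nid, int target_nid,
				      int nr_pages, bool migrated, bool shared)
{
	int flags = 0;

	if (migrated)
		flags |= TNF_MIGRATED;
	if (shared)
		flags |= TNF_SHARED;

	task_numa_fault(last_nid, target_nid, nr_pages, flags);
}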
static inline struct pid *task_pid(struct task_struct *task)
@@ -1817,7 +2007,9 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
* but then during bootup it turns out that sched_clock()
* is reliable after all:
*/
-extern int sched_clock_stable;
+extern int sched_clock_stable(void);
+extern void set_sched_clock_stable(void);
+extern void clear_sched_clock_stable(void);
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
@@ -1896,6 +2088,8 @@ extern int sched_setscheduler(struct task_struct *, int,
const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
const struct sched_param *);
+extern int sched_setattr(struct task_struct *,
+ const struct sched_attr *);
extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
@@ -1975,7 +2169,7 @@ extern void wake_up_new_task(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
-extern void sched_fork(struct task_struct *p);
+extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_dead(struct task_struct *p);
extern void proc_caches_init(void);
@@ -2101,8 +2295,6 @@ extern struct mm_struct *get_task_mm(struct task_struct *task);
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(struct task_struct *, struct mm_struct *);
-/* Allocate a new mm structure and copy contents from tsk->mm */
-extern struct mm_struct *dup_mm(struct task_struct *tsk);
extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *);
@@ -2120,7 +2312,7 @@ extern void do_group_exit(int);
extern int allow_signal(int);
extern int disallow_signal(int);
-extern int do_execve(const char *,
+extern int do_execve(struct filename *,
const char __user * const __user *,
const char __user * const __user *);
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
@@ -2160,6 +2352,16 @@ extern bool current_is_single_threaded(void);
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
+#define __for_each_thread(signal, t) \
+ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
+
+#define for_each_thread(p, t) \
+ __for_each_thread((p)->signal, t)
+
+/* Careful: this is a double loop, 'break' won't work as expected. */
+#define for_each_process_thread(p, t) \
+ for_each_process(p) for_each_thread(p, t)
+
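[Not part of the patch: a usage sketch for the new iterator. The thread list hangs off signal_struct and is walked with list_for_each_entry_rcu(), so callers need an RCU read-side critical section; the function name is hypothetical.]

static void print_thread_pids(struct task_struct *p)
{
	struct task_struct *t;

	rcu_read_lock();
	for_each_thread(p, t)
		pr_info("thread of %d: tid %d\n", task_tgid_nr(p), task_pid_nr(t));
	rcu_read_unlock();
}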
static inline int get_nr_threads(struct task_struct *tsk)
{
return tsk->signal->nr_threads;
@@ -2402,11 +2604,6 @@ static inline int signal_pending_state(long state, struct task_struct *p)
return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}
-static inline int need_resched(void)
-{
- return unlikely(test_thread_flag(TIF_NEED_RESCHED));
-}
-
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -2475,36 +2672,120 @@ static inline int tsk_is_polling(struct task_struct *p)
{
return task_thread_info(p)->status & TS_POLLING;
}
-static inline void current_set_polling(void)
+static inline void __current_set_polling(void)
{
current_thread_info()->status |= TS_POLLING;
}
-static inline void current_clr_polling(void)
+static inline bool __must_check current_set_polling_and_test(void)
+{
+ __current_set_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ */
+ smp_mb();
+
+ return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
{
current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
+}
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ */
+ smp_mb();
+
+ return unlikely(tif_need_resched());
}
#elif defined(TIF_POLLING_NRFLAG)
static inline int tsk_is_polling(struct task_struct *p)
{
return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
}
-static inline void current_set_polling(void)
+
+static inline void __current_set_polling(void)
{
set_thread_flag(TIF_POLLING_NRFLAG);
}
-static inline void current_clr_polling(void)
+static inline bool __must_check current_set_polling_and_test(void)
+{
+ __current_set_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ *
+ * XXX: assumes set/clear bit are identical barrier wise.
+ */
+ smp_mb__after_clear_bit();
+
+ return unlikely(tif_need_resched());
+}
+
+static inline void __current_clr_polling(void)
{
clear_thread_flag(TIF_POLLING_NRFLAG);
}
+
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Polling state must be visible before we test NEED_RESCHED,
+ * paired by resched_task()
+ */
+ smp_mb__after_clear_bit();
+
+ return unlikely(tif_need_resched());
+}
+
#else
static inline int tsk_is_polling(struct task_struct *p) { return 0; }
-static inline void current_set_polling(void) { }
-static inline void current_clr_polling(void) { }
+static inline void __current_set_polling(void) { }
+static inline void __current_clr_polling(void) { }
+
+static inline bool __must_check current_set_polling_and_test(void)
+{
+ return unlikely(tif_need_resched());
+}
+static inline bool __must_check current_clr_polling_and_test(void)
+{
+ return unlikely(tif_need_resched());
+}
#endif
+static inline void current_clr_polling(void)
+{
+ __current_clr_polling();
+
+ /*
+ * Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
+ * Once the bit is cleared, we'll get IPIs with every new
+ * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
+ * fold.
+ */
+ smp_mb(); /* paired with resched_task() */
+
+ preempt_fold_need_resched();
+}
+
+static __always_inline bool need_resched(void)
+{
+ return unlikely(tif_need_resched());
+}
+
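[Not part of the patch: a sketch of the intended shape of a polling idle loop built on these helpers. Setting the polling bit before the NEED_RESCHED test, with the barrier hidden inside the *_and_test() helper, is what lets a remote CPU skip the resched IPI while this CPU spins; the function name is hypothetical.]

static void poll_idle_sketch(void)
{
	if (current_set_polling_and_test())
		goto out;		/* a resched request raced with us; don't idle */

	while (!tif_need_resched())
		cpu_relax();		/* remote CPUs see the polling bit, no IPI needed */
out:
	current_clr_polling();		/* re-check NEED_RESCHED and fold it, see above */
}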
/*
* Thread group CPU time accounting.
*/
@@ -2546,6 +2827,11 @@ static inline unsigned int task_cpu(const struct task_struct *p)
return task_thread_info(p)->cpu;
}
+static inline int task_node(const struct task_struct *p)
+{
+ return cpu_to_node(task_cpu(p));
+}
+
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else