summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile23
-rw-r--r--kernel/acct.c46
-rw-r--r--kernel/async.c2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/audit.h6
-rw-r--r--kernel/auditfilter.c17
-rw-r--r--kernel/auditsc.c754
-rw-r--r--kernel/capability.c80
-rw-r--r--kernel/cgroup.c428
-rw-r--r--kernel/cgroup_freezer.c86
-rw-r--r--kernel/cpu.c8
-rw-r--r--kernel/cpuset.c134
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c189
-rw-r--r--kernel/events/core.c473
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/events/internal.h42
-rw-r--r--kernel/events/ring_buffer.c5
-rw-r--r--kernel/exit.c62
-rw-r--r--kernel/fork.c65
-rw-r--r--kernel/freezer.c203
-rw-r--r--kernel/futex.c28
-rw-r--r--kernel/hrtimer.c6
-rw-r--r--kernel/hung_task.c14
-rw-r--r--kernel/irq/autoprobe.c4
-rw-r--r--kernel/irq/chip.c42
-rw-r--r--kernel/irq/internals.h4
-rw-r--r--kernel/irq/irqdomain.c15
-rw-r--r--kernel/irq/manage.c11
-rw-r--r--kernel/irq/spurious.c6
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c54
-rw-r--r--kernel/kexec.c29
-rw-r--r--kernel/kmod.c27
-rw-r--r--kernel/kprobes.c10
-rw-r--r--kernel/kthread.c27
-rw-r--r--kernel/lockdep.c91
-rw-r--r--kernel/module.c205
-rw-r--r--kernel/panic.c43
-rw-r--r--kernel/params.c41
-rw-r--r--kernel/pid.c8
-rw-r--r--kernel/pid_namespace.c31
-rw-r--r--kernel/posix-cpu-timers.c132
-rw-r--r--kernel/power/hibernate.c104
-rw-r--r--kernel/power/main.c10
-rw-r--r--kernel/power/power.h26
-rw-r--r--kernel/power/process.c93
-rw-r--r--kernel/power/snapshot.c9
-rw-r--r--kernel/power/suspend.c12
-rw-r--r--kernel/power/swap.c14
-rw-r--r--kernel/power/user.c195
-rw-r--r--kernel/printk.c24
-rw-r--r--kernel/ptrace.c27
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcutiny.c149
-rw-r--r--kernel/rcutiny_plugin.h29
-rw-r--r--kernel/rcutorture.c229
-rw-r--r--kernel/rcutree.c290
-rw-r--r--kernel/rcutree.h26
-rw-r--r--kernel/rcutree_plugin.h289
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/relay.c12
-rw-r--r--kernel/res_counter.c28
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c37
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)0
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2306
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)7
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)0
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1155
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)29
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)226
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)109
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)4
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/signal.c82
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c127
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/time/Kconfig2
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clockevents.c1
-rw-r--r--kernel/time/clocksource.c111
-rw-r--r--kernel/time/tick-broadcast.c2
-rw-r--r--kernel/time/tick-sched.c105
-rw-r--r--kernel/time/timekeeping.c96
-rw-r--r--kernel/timer.c64
-rw-r--r--kernel/trace/blktrace.c2
-rw-r--r--kernel/trace/ftrace.c720
-rw-r--r--kernel/trace/trace.c108
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_events.c1
-rw-r--r--kernel/trace/trace_events_filter.c294
-rw-r--r--kernel/trace/trace_irqsoff.c13
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/trace/trace_stack.c30
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/wait.c4
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c32
114 files changed, 7424 insertions, 4767 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02..2d9de86b7e7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
# Makefile for the linux kernel.
#
-obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
+obj-y = fork.o exec_domain.o panic.o printk.o \
cpu.o exit.o itimer.o time.o softirq.o resource.o \
sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o sched_clock.o cred.o \
- async.o range.o
-obj-y += groups.o
+ notifier.o ksysfs.o cred.o \
+ async.o range.o groups.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace debug files and internal ftrace files
@@ -20,10 +19,12 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
-CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
+obj-y += sched/
+obj-y += power/
+
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -52,8 +53,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
-obj-$(CONFIG_PM) += power/
-obj-$(CONFIG_FREEZER) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
@@ -99,7 +98,6 @@ obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
-obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
@@ -110,15 +108,6 @@ obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
-ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
-# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
-# needed for x86 only. Why this used to be enabled for all architectures is beyond
-# me. I suspect most platforms don't need this, but until we know that for sure
-# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
-# to get a correct value for the wait-channel (WCHAN in ps). --davidm
-CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
-endif
-
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2dd..02e6167a53b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -84,11 +84,10 @@ static void do_acct_process(struct bsd_acct_struct *acct,
* the cache line to have the data after getting the lock.
*/
struct bsd_acct_struct {
- volatile int active;
- volatile int needcheck;
+ int active;
+ unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
- struct timer_list timer;
struct list_head list;
};
@@ -96,15 +95,6 @@ static DEFINE_SPINLOCK(acct_lock);
static LIST_HEAD(acct_list);
/*
- * Called whenever the timer says to check the free space.
- */
-static void acct_timeout(unsigned long x)
-{
- struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
- acct->needcheck = 1;
-}
-
-/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
@@ -112,12 +102,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
struct kstatfs sbuf;
int res;
int act;
- sector_t resume;
- sector_t suspend;
+ u64 resume;
+ u64 suspend;
spin_lock(&acct_lock);
res = acct->active;
- if (!file || !acct->needcheck)
+ if (!file || time_is_before_jiffies(acct->needcheck))
goto out;
spin_unlock(&acct_lock);
@@ -127,8 +117,8 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
suspend = sbuf.f_blocks * SUSPEND;
resume = sbuf.f_blocks * RESUME;
- sector_div(suspend, 100);
- sector_div(resume, 100);
+ do_div(suspend, 100);
+ do_div(resume, 100);
if (sbuf.f_bavail <= suspend)
act = -1;
@@ -160,10 +150,7 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
}
}
- del_timer(&acct->timer);
- acct->needcheck = 0;
- acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct->timer);
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
res = acct->active;
out:
spin_unlock(&acct_lock);
@@ -185,9 +172,7 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
if (acct->file) {
old_acct = acct->file;
old_ns = acct->ns;
- del_timer(&acct->timer);
acct->active = 0;
- acct->needcheck = 0;
acct->file = NULL;
acct->ns = NULL;
list_del(&acct->list);
@@ -195,13 +180,9 @@ static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
if (file) {
acct->file = file;
acct->ns = ns;
- acct->needcheck = 0;
+ acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
acct->active = 1;
list_add(&acct->list, &acct_list);
- /* It's been deleted if it was used before so this is safe */
- setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
- acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
- add_timer(&acct->timer);
}
if (old_acct) {
mnt_unpin(old_acct->f_path.mnt);
@@ -334,7 +315,7 @@ void acct_auto_close(struct super_block *sb)
spin_lock(&acct_lock);
restart:
list_for_each_entry(acct, &acct_list, list)
- if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
+ if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
acct_file_reopen(acct, NULL, NULL);
goto restart;
}
@@ -348,7 +329,6 @@ void acct_exit_ns(struct pid_namespace *ns)
if (acct == NULL)
return;
- del_timer_sync(&acct->timer);
spin_lock(&acct_lock);
if (acct->file != NULL)
acct_file_reopen(acct, NULL, NULL);
@@ -498,7 +478,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
*/
- memset((caddr_t)&ac, 0, sizeof(acct_t));
+ memset(&ac, 0, sizeof(acct_t));
ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER;
strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm));
@@ -613,8 +593,8 @@ void acct_collect(long exitcode, int group_dead)
pacct->ac_flag |= ACORE;
if (current->flags & PF_SIGNALED)
pacct->ac_flag |= AXSIG;
- pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
- pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+ pacct->ac_utime += current->utime;
+ pacct->ac_stime += current->stime;
pacct->ac_minflt += current->min_flt;
pacct->ac_majflt += current->maj_flt;
spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/async.c b/kernel/async.c
index 80b74b88fef..bd0c168a3bb 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -78,8 +78,6 @@ static DECLARE_WAIT_QUEUE_HEAD(async_done);
static atomic_t entry_count;
-extern int initcall_debug;
-
/*
* MUST be called with the lock held!
diff --git a/kernel/audit.c b/kernel/audit.c
index 09fae2677a4..bb0eb5bb9a0 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -601,13 +601,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
case AUDIT_TTY_SET:
case AUDIT_TRIM:
case AUDIT_MAKE_EQUIV:
- if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+ if (!capable(CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
- if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+ if (!capable(CAP_AUDIT_WRITE))
err = -EPERM;
break;
default: /* bad msg */
@@ -631,7 +631,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
- audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+ audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
pid, uid, auid, ses);
if (sid) {
rc = security_secid_to_secctx(sid, &ctx, &len);
@@ -1260,12 +1260,13 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
avail = audit_expand(ab,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
- goto out;
+ goto out_va_end;
len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
- va_end(args2);
if (len > 0)
skb_put(skb, len);
+out_va_end:
+ va_end(args2);
out:
return;
}
@@ -1422,7 +1423,7 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
char *p, *pathname;
if (prefix)
- audit_log_format(ab, " %s", prefix);
+ audit_log_format(ab, "%s", prefix);
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
diff --git a/kernel/audit.h b/kernel/audit.h
index 91e7071c4d2..81676680337 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -36,12 +36,8 @@ enum audit_state {
AUDIT_DISABLED, /* Do not create per-task audit_context.
* No syscall-specific audit records can
* be generated. */
- AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context,
- * but don't necessarily fill it in at
- * syscall entry time (i.e., filter
- * instead). */
AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context,
- * and always fill it in at syscall
+ * and fill it in at syscall
* entry time. This makes a full
* syscall record available if some
* other part of the kernel decides it
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f8277c80d67..a6c3f1abd20 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -235,13 +235,15 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
switch(listnr) {
default:
goto exit_err;
- case AUDIT_FILTER_USER:
- case AUDIT_FILTER_TYPE:
#ifdef CONFIG_AUDITSYSCALL
case AUDIT_FILTER_ENTRY:
+ if (rule->action == AUDIT_ALWAYS)
+ goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_TASK:
#endif
+ case AUDIT_FILTER_USER:
+ case AUDIT_FILTER_TYPE:
;
}
if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -385,7 +387,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
goto exit_free;
break;
case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
+ if (f->val & ~S_IFMT)
goto exit_free;
break;
case AUDIT_INODE:
@@ -459,6 +461,8 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_ARG1:
case AUDIT_ARG2:
case AUDIT_ARG3:
+ case AUDIT_OBJ_UID:
+ case AUDIT_OBJ_GID:
break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
@@ -522,7 +526,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
break;
case AUDIT_FILTERKEY:
- err = -EINVAL;
if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f->val);
@@ -536,7 +539,11 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
break;
case AUDIT_FILETYPE:
- if ((f->val & ~S_IFMT) > S_IFMT)
+ if (f->val & ~S_IFMT)
+ goto exit_free;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
goto exit_free;
break;
default:
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 47b7fc1ea89..af1de0f34ea 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,9 +70,15 @@
#include "audit.h"
+/* flags stating the success for a syscall */
+#define AUDITSC_INVALID 0
+#define AUDITSC_SUCCESS 1
+#define AUDITSC_FAILURE 2
+
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). */
-#define AUDIT_NAMES 20
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
/* Indicates that audit should log the full pathname. */
#define AUDIT_NAME_FULL -1
@@ -101,9 +107,8 @@ struct audit_cap_data {
*
* Further, in fs/namei.c:path_lookup() we store the inode and device. */
struct audit_names {
+ struct list_head list; /* audit_context->names_list */
const char *name;
- int name_len; /* number of name's characters to log */
- unsigned name_put; /* call __putname() for this name */
unsigned long ino;
dev_t dev;
umode_t mode;
@@ -113,6 +118,14 @@ struct audit_names {
u32 osid;
struct audit_cap_data fcap;
unsigned int fcap_ver;
+ int name_len; /* number of name's characters to log */
+ bool name_put; /* call __putname() for this name */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit
+ */
+ bool should_free;
};
struct audit_aux_data {
@@ -174,8 +187,17 @@ struct audit_context {
long return_code;/* syscall return code */
u64 prio;
int return_valid; /* return code is valid */
- int name_count;
- struct audit_names names[AUDIT_NAMES];
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* anchor for struct audit_names->list */
char * filterkey; /* key for rule that triggered record */
struct path pwd;
struct audit_context *previous; /* For nested syscalls */
@@ -210,12 +232,12 @@ struct audit_context {
struct {
uid_t uid;
gid_t gid;
- mode_t mode;
+ umode_t mode;
u32 osid;
int has_perm;
uid_t perm_uid;
gid_t perm_gid;
- mode_t perm_mode;
+ umode_t perm_mode;
unsigned long qbytes;
} ipc;
struct {
@@ -234,7 +256,7 @@ struct audit_context {
} mq_sendrecv;
struct {
int oflag;
- mode_t mode;
+ umode_t mode;
struct mq_attr attr;
} mq_open;
struct {
@@ -305,21 +327,21 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
}
}
-static int audit_match_filetype(struct audit_context *ctx, int which)
+static int audit_match_filetype(struct audit_context *ctx, int val)
{
- unsigned index = which & ~S_IFMT;
- mode_t mode = which & S_IFMT;
+ struct audit_names *n;
+ umode_t mode = (umode_t)val;
if (unlikely(!ctx))
return 0;
- if (index >= ctx->name_count)
- return 0;
- if (ctx->names[index].ino == -1)
- return 0;
- if ((ctx->names[index].mode ^ mode) & S_IFMT)
- return 0;
- return 1;
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if ((n->ino != -1) &&
+ ((n->mode & S_IFMT) == mode))
+ return 1;
+ }
+
+ return 0;
}
/*
@@ -441,6 +463,134 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
return 0;
}
+static int audit_compare_id(uid_t uid1,
+ struct audit_names *name,
+ unsigned long name_offset,
+ struct audit_field *f,
+ struct audit_context *ctx)
+{
+ struct audit_names *n;
+ unsigned long addr;
+ uid_t uid2;
+ int rc;
+
+ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t));
+
+ if (name) {
+ addr = (unsigned long)name;
+ addr += name_offset;
+
+ uid2 = *(uid_t *)addr;
+ rc = audit_comparator(uid1, f->op, uid2);
+ if (rc)
+ return rc;
+ }
+
+ if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ addr = (unsigned long)n;
+ addr += name_offset;
+
+ uid2 = *(uid_t *)addr;
+
+ rc = audit_comparator(uid1, f->op, uid2);
+ if (rc)
+ return rc;
+ }
+ }
+ return 0;
+}
+
+static int audit_field_compare(struct task_struct *tsk,
+ const struct cred *cred,
+ struct audit_field *f,
+ struct audit_context *ctx,
+ struct audit_names *name)
+{
+ switch (f->val) {
+ /* process to file object comparisons */
+ case AUDIT_COMPARE_UID_TO_OBJ_UID:
+ return audit_compare_id(cred->uid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_GID_TO_OBJ_GID:
+ return audit_compare_id(cred->gid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ case AUDIT_COMPARE_EUID_TO_OBJ_UID:
+ return audit_compare_id(cred->euid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_EGID_TO_OBJ_GID:
+ return audit_compare_id(cred->egid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ case AUDIT_COMPARE_AUID_TO_OBJ_UID:
+ return audit_compare_id(tsk->loginuid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_SUID_TO_OBJ_UID:
+ return audit_compare_id(cred->suid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_SGID_TO_OBJ_GID:
+ return audit_compare_id(cred->sgid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ case AUDIT_COMPARE_FSUID_TO_OBJ_UID:
+ return audit_compare_id(cred->fsuid,
+ name, offsetof(struct audit_names, uid),
+ f, ctx);
+ case AUDIT_COMPARE_FSGID_TO_OBJ_GID:
+ return audit_compare_id(cred->fsgid,
+ name, offsetof(struct audit_names, gid),
+ f, ctx);
+ /* uid comparisons */
+ case AUDIT_COMPARE_UID_TO_AUID:
+ return audit_comparator(cred->uid, f->op, tsk->loginuid);
+ case AUDIT_COMPARE_UID_TO_EUID:
+ return audit_comparator(cred->uid, f->op, cred->euid);
+ case AUDIT_COMPARE_UID_TO_SUID:
+ return audit_comparator(cred->uid, f->op, cred->suid);
+ case AUDIT_COMPARE_UID_TO_FSUID:
+ return audit_comparator(cred->uid, f->op, cred->fsuid);
+ /* auid comparisons */
+ case AUDIT_COMPARE_AUID_TO_EUID:
+ return audit_comparator(tsk->loginuid, f->op, cred->euid);
+ case AUDIT_COMPARE_AUID_TO_SUID:
+ return audit_comparator(tsk->loginuid, f->op, cred->suid);
+ case AUDIT_COMPARE_AUID_TO_FSUID:
+ return audit_comparator(tsk->loginuid, f->op, cred->fsuid);
+ /* euid comparisons */
+ case AUDIT_COMPARE_EUID_TO_SUID:
+ return audit_comparator(cred->euid, f->op, cred->suid);
+ case AUDIT_COMPARE_EUID_TO_FSUID:
+ return audit_comparator(cred->euid, f->op, cred->fsuid);
+ /* suid comparisons */
+ case AUDIT_COMPARE_SUID_TO_FSUID:
+ return audit_comparator(cred->suid, f->op, cred->fsuid);
+ /* gid comparisons */
+ case AUDIT_COMPARE_GID_TO_EGID:
+ return audit_comparator(cred->gid, f->op, cred->egid);
+ case AUDIT_COMPARE_GID_TO_SGID:
+ return audit_comparator(cred->gid, f->op, cred->sgid);
+ case AUDIT_COMPARE_GID_TO_FSGID:
+ return audit_comparator(cred->gid, f->op, cred->fsgid);
+ /* egid comparisons */
+ case AUDIT_COMPARE_EGID_TO_SGID:
+ return audit_comparator(cred->egid, f->op, cred->sgid);
+ case AUDIT_COMPARE_EGID_TO_FSGID:
+ return audit_comparator(cred->egid, f->op, cred->fsgid);
+ /* sgid comparison */
+ case AUDIT_COMPARE_SGID_TO_FSGID:
+ return audit_comparator(cred->sgid, f->op, cred->fsgid);
+ default:
+ WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n");
+ return 0;
+ }
+ return 0;
+}
+
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise.
@@ -457,13 +607,14 @@ static int audit_filter_rules(struct task_struct *tsk,
bool task_creation)
{
const struct cred *cred;
- int i, j, need_sid = 1;
+ int i, need_sid = 1;
u32 sid;
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
+ struct audit_names *n;
int result = 0;
switch (f->type) {
@@ -522,12 +673,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMAJOR:
- if (name)
- result = audit_comparator(MAJOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MAJOR(name->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MAJOR(n->dev), f->op, f->val) ||
+ audit_comparator(MAJOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -535,12 +688,14 @@ static int audit_filter_rules(struct task_struct *tsk,
}
break;
case AUDIT_DEVMINOR:
- if (name)
- result = audit_comparator(MINOR(name->dev),
- f->op, f->val);
- else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
+ if (name) {
+ if (audit_comparator(MINOR(name->dev), f->op, f->val) ||
+ audit_comparator(MINOR(name->rdev), f->op, f->val))
+ ++result;
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(MINOR(n->dev), f->op, f->val) ||
+ audit_comparator(MINOR(n->rdev), f->op, f->val)) {
++result;
break;
}
@@ -551,8 +706,32 @@ static int audit_filter_rules(struct task_struct *tsk,
if (name)
result = (name->ino == f->val);
else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->ino, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_UID:
+ if (name) {
+ result = audit_comparator(name->uid, f->op, f->val);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->uid, f->op, f->val)) {
+ ++result;
+ break;
+ }
+ }
+ }
+ break;
+ case AUDIT_OBJ_GID:
+ if (name) {
+ result = audit_comparator(name->gid, f->op, f->val);
+ } else if (ctx) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_comparator(n->gid, f->op, f->val)) {
++result;
break;
}
@@ -607,11 +786,10 @@ static int audit_filter_rules(struct task_struct *tsk,
name->osid, f->type, f->op,
f->lsm_rule, ctx);
} else if (ctx) {
- for (j = 0; j < ctx->name_count; j++) {
- if (security_audit_rule_match(
- ctx->names[j].osid,
- f->type, f->op,
- f->lsm_rule, ctx)) {
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (security_audit_rule_match(n->osid, f->type,
+ f->op, f->lsm_rule,
+ ctx)) {
++result;
break;
}
@@ -643,8 +821,10 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_FILETYPE:
result = audit_match_filetype(ctx, f->val);
break;
+ case AUDIT_FIELD_COMPARE:
+ result = audit_field_compare(tsk, cred, f, ctx, name);
+ break;
}
-
if (!result)
return 0;
}
@@ -722,40 +902,53 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
return AUDIT_BUILD_CONTEXT;
}
-/* At syscall exit time, this filter is called if any audit_names[] have been
+/*
+ * Given an audit_name check the inode hash table to see if they match.
+ * Called holding the rcu read lock to protect the use of audit_inode_hash
+ */
+static int audit_filter_inode_name(struct task_struct *tsk,
+ struct audit_names *n,
+ struct audit_context *ctx) {
+ int word, bit;
+ int h = audit_hash_ino((u32)n->ino);
+ struct list_head *list = &audit_inode_hash[h];
+ struct audit_entry *e;
+ enum audit_state state;
+
+ word = AUDIT_WORD(ctx->major);
+ bit = AUDIT_BIT(ctx->major);
+
+ if (list_empty(list))
+ return 0;
+
+ list_for_each_entry_rcu(e, list, list) {
+ if ((e->rule.mask[word] & bit) == bit &&
+ audit_filter_rules(tsk, &e->rule, ctx, n, &state, false)) {
+ ctx->current_state = state;
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* At syscall exit time, this filter is called if any audit_names have been
* collected during syscall processing. We only check rules in sublists at hash
- * buckets applicable to the inode numbers in audit_names[].
+ * buckets applicable to the inode numbers in audit_names.
* Regarding audit_state, same rules apply as for audit_filter_syscall().
*/
void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
{
- int i;
- struct audit_entry *e;
- enum audit_state state;
+ struct audit_names *n;
if (audit_pid && tsk->tgid == audit_pid)
return;
rcu_read_lock();
- for (i = 0; i < ctx->name_count; i++) {
- int word = AUDIT_WORD(ctx->major);
- int bit = AUDIT_BIT(ctx->major);
- struct audit_names *n = &ctx->names[i];
- int h = audit_hash_ino((u32)n->ino);
- struct list_head *list = &audit_inode_hash[h];
-
- if (list_empty(list))
- continue;
- list_for_each_entry_rcu(e, list, list) {
- if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n,
- &state, false)) {
- rcu_read_unlock();
- ctx->current_state = state;
- return;
- }
- }
+ list_for_each_entry(n, &ctx->names_list, list) {
+ if (audit_filter_inode_name(tsk, n, ctx))
+ break;
}
rcu_read_unlock();
}
@@ -766,7 +959,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
{
struct audit_context *context = tsk->audit_context;
- if (likely(!context))
+ if (!context)
return NULL;
context->return_valid = return_valid;
@@ -799,7 +992,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
static inline void audit_free_names(struct audit_context *context)
{
- int i;
+ struct audit_names *n, *next;
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
@@ -810,10 +1003,9 @@ static inline void audit_free_names(struct audit_context *context)
context->serial, context->major, context->in_syscall,
context->name_count, context->put_count,
context->ino_count);
- for (i = 0; i < context->name_count; i++) {
+ list_for_each_entry(n, &context->names_list, list) {
printk(KERN_ERR "names[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
+ n->name, n->name ?: "(null)");
}
dump_stack();
return;
@@ -824,9 +1016,12 @@ static inline void audit_free_names(struct audit_context *context)
context->ino_count = 0;
#endif
- for (i = 0; i < context->name_count; i++) {
- if (context->names[i].name && context->names[i].name_put)
- __putname(context->names[i].name);
+ list_for_each_entry_safe(n, next, &context->names_list, list) {
+ list_del(&n->list);
+ if (n->name && n->name_put)
+ __putname(n->name);
+ if (n->should_free)
+ kfree(n);
}
context->name_count = 0;
path_put(&context->pwd);
@@ -864,6 +1059,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
return NULL;
audit_zero_context(context, state);
INIT_LIST_HEAD(&context->killed_trees);
+ INIT_LIST_HEAD(&context->names_list);
return context;
}
@@ -886,7 +1082,7 @@ int audit_alloc(struct task_struct *tsk)
return 0; /* Return if not auditing. */
state = audit_filter_task(tsk, &key);
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED)
return 0;
if (!(context = audit_alloc_context(state))) {
@@ -975,7 +1171,7 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
while (vma) {
if ((vma->vm_flags & VM_EXECUTABLE) &&
vma->vm_file) {
- audit_log_d_path(ab, "exe=",
+ audit_log_d_path(ab, " exe=",
&vma->vm_file->f_path);
break;
}
@@ -1166,8 +1362,8 @@ static void audit_log_execve_info(struct audit_context *context,
struct audit_buffer **ab,
struct audit_aux_data_execve *axi)
{
- int i;
- size_t len, len_sent = 0;
+ int i, len;
+ size_t len_sent = 0;
const char __user *p;
char *buf;
@@ -1249,7 +1445,7 @@ static void show_special(struct audit_context *context, int *call_panic)
case AUDIT_IPC: {
u32 osid = context->ipc.osid;
- audit_log_format(ab, "ouid=%u ogid=%u mode=%#o",
+ audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho",
context->ipc.uid, context->ipc.gid, context->ipc.mode);
if (osid) {
char *ctx = NULL;
@@ -1267,7 +1463,7 @@ static void show_special(struct audit_context *context, int *call_panic)
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
audit_log_format(ab,
- "qbytes=%lx ouid=%u ogid=%u mode=%#o",
+ "qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
@@ -1278,7 +1474,7 @@ static void show_special(struct audit_context *context, int *call_panic)
break; }
case AUDIT_MQ_OPEN: {
audit_log_format(ab,
- "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
+ "oflag=0x%x mode=%#ho mq_flags=0x%lx mq_maxmsg=%ld "
"mq_msgsize=%ld mq_curmsgs=%ld",
context->mq_open.oflag, context->mq_open.mode,
context->mq_open.attr.mq_flags,
@@ -1324,6 +1520,68 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
}
+static void audit_log_name(struct audit_context *context, struct audit_names *n,
+ int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return; /* audit_panic has been called */
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ n->uid,
+ n->gid,
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ audit_log_fcaps(ab, n);
+
+ audit_log_end(ab);
+}
+
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
const struct cred *cred;
@@ -1331,6 +1589,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
struct audit_buffer *ab;
struct audit_aux_data *aux;
const char *tty;
+ struct audit_names *n;
/* tsk == current */
context->pid = tsk->pid;
@@ -1466,70 +1725,14 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
if (context->pwd.dentry && context->pwd.mnt) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
- audit_log_d_path(ab, "cwd=", &context->pwd);
+ audit_log_d_path(ab, " cwd=", &context->pwd);
audit_log_end(ab);
}
}
- for (i = 0; i < context->name_count; i++) {
- struct audit_names *n = &context->names[i];
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- continue; /* audit_panic has been called */
-
- audit_log_format(ab, "item=%d", i);
-
- if (n->name) {
- switch(n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, "name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#o"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- n->uid,
- n->gid,
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- audit_log_fcaps(ab, n);
-
- audit_log_end(ab);
- }
+ i = 0;
+ list_for_each_entry(n, &context->names_list, list)
+ audit_log_name(context, n, i++, &call_panic);
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1545,12 +1748,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
*
* Called from copy_process and do_exit
*/
-void audit_free(struct task_struct *tsk)
+void __audit_free(struct task_struct *tsk)
{
struct audit_context *context;
context = audit_get_context(tsk, 0, 0);
- if (likely(!context))
+ if (!context)
return;
/* Check for system calls that do not go through the exit
@@ -1583,7 +1786,7 @@ void audit_free(struct task_struct *tsk)
* will only be written if another part of the kernel requests that it
* be written).
*/
-void audit_syscall_entry(int arch, int major,
+void __audit_syscall_entry(int arch, int major,
unsigned long a1, unsigned long a2,
unsigned long a3, unsigned long a4)
{
@@ -1591,7 +1794,7 @@ void audit_syscall_entry(int arch, int major,
struct audit_context *context = tsk->audit_context;
enum audit_state state;
- if (unlikely(!context))
+ if (!context)
return;
/*
@@ -1648,7 +1851,7 @@ void audit_syscall_entry(int arch, int major,
context->prio = 0;
state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
}
- if (likely(state == AUDIT_DISABLED))
+ if (state == AUDIT_DISABLED)
return;
context->serial = 0;
@@ -1658,45 +1861,29 @@ void audit_syscall_entry(int arch, int major,
context->ppid = 0;
}
-void audit_finish_fork(struct task_struct *child)
-{
- struct audit_context *ctx = current->audit_context;
- struct audit_context *p = child->audit_context;
- if (!p || !ctx)
- return;
- if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT)
- return;
- p->arch = ctx->arch;
- p->major = ctx->major;
- memcpy(p->argv, ctx->argv, sizeof(ctx->argv));
- p->ctime = ctx->ctime;
- p->dummy = ctx->dummy;
- p->in_syscall = ctx->in_syscall;
- p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL);
- p->ppid = current->pid;
- p->prio = ctx->prio;
- p->current_state = ctx->current_state;
-}
-
/**
* audit_syscall_exit - deallocate audit context after a system call
- * @valid: success/failure flag
- * @return_code: syscall return value
+ * @success: success value of the syscall
+ * @return_code: return value of the syscall
*
* Tear down after system call. If the audit context has been marked as
* auditable (either because of the AUDIT_RECORD_CONTEXT state from
- * filtering, or because some other part of the kernel write an audit
+ * filtering, or because some other part of the kernel wrote an audit
* message), then write out the syscall information. In call cases,
* free the names stored from getname().
*/
-void audit_syscall_exit(int valid, long return_code)
+void __audit_syscall_exit(int success, long return_code)
{
struct task_struct *tsk = current;
struct audit_context *context;
- context = audit_get_context(tsk, valid, return_code);
+ if (success)
+ success = AUDITSC_SUCCESS;
+ else
+ success = AUDITSC_FAILURE;
- if (likely(!context))
+ context = audit_get_context(tsk, success, return_code);
+ if (!context)
return;
if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
@@ -1821,6 +2008,30 @@ retry:
#endif
}
+static struct audit_names *audit_alloc_name(struct audit_context *context)
+{
+ struct audit_names *aname;
+
+ if (context->name_count < AUDIT_NAMES) {
+ aname = &context->preallocated_names[context->name_count];
+ memset(aname, 0, sizeof(*aname));
+ } else {
+ aname = kzalloc(sizeof(*aname), GFP_NOFS);
+ if (!aname)
+ return NULL;
+ aname->should_free = true;
+ }
+
+ aname->ino = (unsigned long)-1;
+ list_add_tail(&aname->list, &context->names_list);
+
+ context->name_count++;
+#if AUDIT_DEBUG
+ context->ino_count++;
+#endif
+ return aname;
+}
+
/**
* audit_getname - add a name to the list
* @name: name to add
@@ -1831,9 +2042,7 @@ retry:
void __audit_getname(const char *name)
{
struct audit_context *context = current->audit_context;
-
- if (IS_ERR(name) || !name)
- return;
+ struct audit_names *n;
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
@@ -1843,13 +2052,15 @@ void __audit_getname(const char *name)
#endif
return;
}
- BUG_ON(context->name_count >= AUDIT_NAMES);
- context->names[context->name_count].name = name;
- context->names[context->name_count].name_len = AUDIT_NAME_FULL;
- context->names[context->name_count].name_put = 1;
- context->names[context->name_count].ino = (unsigned long)-1;
- context->names[context->name_count].osid = 0;
- ++context->name_count;
+
+ n = audit_alloc_name(context);
+ if (!n)
+ return;
+
+ n->name = name;
+ n->name_len = AUDIT_NAME_FULL;
+ n->name_put = true;
+
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
}
@@ -1871,12 +2082,13 @@ void audit_putname(const char *name)
printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
+ struct audit_names *n;
int i;
- for (i = 0; i < context->name_count; i++)
+
+ list_for_each_entry(n, &context->names_list, list)
printk(KERN_ERR "name[%d] = %p = %s\n", i,
- context->names[i].name,
- context->names[i].name ?: "(null)");
- }
+ n->name, n->name ?: "(null)");
+ }
#endif
__putname(name);
}
@@ -1897,39 +2109,11 @@ void audit_putname(const char *name)
#endif
}
-static int audit_inc_name_count(struct audit_context *context,
- const struct inode *inode)
-{
- if (context->name_count >= AUDIT_NAMES) {
- if (inode)
- printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
- "dev=%02x:%02x, inode=%lu\n",
- MAJOR(inode->i_sb->s_dev),
- MINOR(inode->i_sb->s_dev),
- inode->i_ino);
-
- else
- printk(KERN_DEBUG "name_count maxed, losing inode data\n");
- return 1;
- }
- context->name_count++;
-#if AUDIT_DEBUG
- context->ino_count++;
-#endif
- return 0;
-}
-
-
static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
{
struct cpu_vfs_cap_data caps;
int rc;
- memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
- memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
- name->fcap.fE = 0;
- name->fcap_ver = 0;
-
if (!dentry)
return 0;
@@ -1969,30 +2153,25 @@ static void audit_copy_inode(struct audit_names *name, const struct dentry *dent
*/
void __audit_inode(const char *name, const struct dentry *dentry)
{
- int idx;
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
+ struct audit_names *n;
if (!context->in_syscall)
return;
- if (context->name_count
- && context->names[context->name_count-1].name
- && context->names[context->name_count-1].name == name)
- idx = context->name_count - 1;
- else if (context->name_count > 1
- && context->names[context->name_count-2].name
- && context->names[context->name_count-2].name == name)
- idx = context->name_count - 2;
- else {
- /* FIXME: how much do we care about inodes that have no
- * associated name? */
- if (audit_inc_name_count(context, inode))
- return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
+
+ list_for_each_entry_reverse(n, &context->names_list, list) {
+ if (n->name && (n->name == name))
+ goto out;
}
+
+ /* unable to find the name from a previous getname() */
+ n = audit_alloc_name(context);
+ if (!n)
+ return;
+out:
handle_path(dentry);
- audit_copy_inode(&context->names[idx], dentry, inode);
+ audit_copy_inode(n, dentry, inode);
}
/**
@@ -2011,11 +2190,11 @@ void __audit_inode(const char *name, const struct dentry *dentry)
void __audit_inode_child(const struct dentry *dentry,
const struct inode *parent)
{
- int idx;
struct audit_context *context = current->audit_context;
const char *found_parent = NULL, *found_child = NULL;
const struct inode *inode = dentry->d_inode;
const char *dname = dentry->d_name.name;
+ struct audit_names *n;
int dirlen = 0;
if (!context->in_syscall)
@@ -2025,9 +2204,7 @@ void __audit_inode_child(const struct dentry *dentry,
handle_one(inode);
/* parent is more likely, look for it first */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
-
+ list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
@@ -2040,9 +2217,7 @@ void __audit_inode_child(const struct dentry *dentry,
}
/* no matching parent, look for matching child */
- for (idx = 0; idx < context->name_count; idx++) {
- struct audit_names *n = &context->names[idx];
-
+ list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
@@ -2060,34 +2235,29 @@ void __audit_inode_child(const struct dentry *dentry,
add_names:
if (!found_parent) {
- if (audit_inc_name_count(context, parent))
+ n = audit_alloc_name(context);
+ if (!n)
return;
- idx = context->name_count - 1;
- context->names[idx].name = NULL;
- audit_copy_inode(&context->names[idx], NULL, parent);
+ audit_copy_inode(n, NULL, parent);
}
if (!found_child) {
- if (audit_inc_name_count(context, inode))
+ n = audit_alloc_name(context);
+ if (!n)
return;
- idx = context->name_count - 1;
/* Re-use the name belonging to the slot for a matching parent
* directory. All names for this context are relinquished in
* audit_free_names() */
if (found_parent) {
- context->names[idx].name = found_parent;
- context->names[idx].name_len = AUDIT_NAME_FULL;
+ n->name = found_parent;
+ n->name_len = AUDIT_NAME_FULL;
/* don't call __putname() */
- context->names[idx].name_put = 0;
- } else {
- context->names[idx].name = NULL;
+ n->name_put = false;
}
if (inode)
- audit_copy_inode(&context->names[idx], NULL, inode);
- else
- context->names[idx].ino = (unsigned long)-1;
+ audit_copy_inode(n, NULL, inode);
}
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
@@ -2121,19 +2291,28 @@ int auditsc_get_stamp(struct audit_context *ctx,
static atomic_t session_id = ATOMIC_INIT(0);
/**
- * audit_set_loginuid - set a task's audit_context loginuid
- * @task: task whose audit context is being modified
+ * audit_set_loginuid - set current task's audit_context loginuid
* @loginuid: loginuid value
*
* Returns 0.
*
* Called (set) from fs/proc/base.c::proc_loginuid_write().
*/
-int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+int audit_set_loginuid(uid_t loginuid)
{
- unsigned int sessionid = atomic_inc_return(&session_id);
+ struct task_struct *task = current;
struct audit_context *context = task->audit_context;
+ unsigned int sessionid;
+
+#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
+ if (task->loginuid != -1)
+ return -EPERM;
+#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+#endif /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
+ sessionid = atomic_inc_return(&session_id);
if (context && context->in_syscall) {
struct audit_buffer *ab;
@@ -2160,7 +2339,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
* @attr: queue attributes
*
*/
-void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
+void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr)
{
struct audit_context *context = current->audit_context;
@@ -2260,7 +2439,7 @@ void __audit_ipc_obj(struct kern_ipc_perm *ipcp)
*
* Called only after audit_ipc_obj().
*/
-void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
+void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode)
{
struct audit_context *context = current->audit_context;
@@ -2271,14 +2450,11 @@ void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mod
context->ipc.has_perm = 1;
}
-int audit_bprm(struct linux_binprm *bprm)
+int __audit_bprm(struct linux_binprm *bprm)
{
struct audit_aux_data_execve *ax;
struct audit_context *context = current->audit_context;
- if (likely(!audit_enabled || !context || context->dummy))
- return 0;
-
ax = kmalloc(sizeof(*ax), GFP_KERNEL);
if (!ax)
return -ENOMEM;
@@ -2299,13 +2475,10 @@ int audit_bprm(struct linux_binprm *bprm)
* @args: args array
*
*/
-void audit_socketcall(int nargs, unsigned long *args)
+void __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return;
-
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
@@ -2331,13 +2504,10 @@ void __audit_fd_pair(int fd1, int fd2)
*
* Returns 0 for success or NULL context or < 0 on error.
*/
-int audit_sockaddr(int len, void *a)
+int __audit_sockaddr(int len, void *a)
{
struct audit_context *context = current->audit_context;
- if (likely(!context || context->dummy))
- return 0;
-
if (!context->sockaddr) {
void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
if (!p)
@@ -2499,6 +2669,25 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+ uid_t auid, uid;
+ gid_t gid;
+ unsigned int sessionid;
+
+ auid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ current_uid_gid(&uid, &gid);
+
+ audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
+ auid, uid, gid, sessionid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " pid=%d comm=", current->pid);
+ audit_log_untrustedstring(ab, current->comm);
+ audit_log_format(ab, " reason=");
+ audit_log_string(ab, reason);
+ audit_log_format(ab, " sig=%ld", signr);
+}
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
@@ -2509,10 +2698,6 @@ void __audit_mmap_fd(int fd, int flags)
void audit_core_dumps(long signr)
{
struct audit_buffer *ab;
- u32 sid;
- uid_t auid = audit_get_loginuid(current), uid;
- gid_t gid;
- unsigned int sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -2521,24 +2706,17 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- current_uid_gid(&uid, &gid);
- audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
- auid, uid, gid, sessionid);
- security_task_getsecid(current, &sid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
+ audit_log_abend(ab, "memory violation", signr);
+ audit_log_end(ab);
+}
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
- audit_log_format(ab, " pid=%d comm=", current->pid);
- audit_log_untrustedstring(ab, current->comm);
- audit_log_format(ab, " sig=%ld", signr);
+void __audit_seccomp(unsigned long syscall)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ audit_log_abend(ab, "seccomp", SIGKILL);
+ audit_log_format(ab, " syscall=%ld", syscall);
audit_log_end(ab);
}
diff --git a/kernel/capability.c b/kernel/capability.c
index b463871a4e6..3f1adb6c647 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -287,74 +287,84 @@ error:
}
/**
- * has_capability - Does a task have a capability in init_user_ns
+ * has_ns_capability - Does a task have a capability in a specific user ns
* @t: The task in question
+ * @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
- * currently in effect to the initial user namespace, false if not.
+ * currently in effect to the specified user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
-bool has_capability(struct task_struct *t, int cap)
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
{
- int ret = security_real_capable(t, &init_user_ns, cap);
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable(__task_cred(t), ns, cap);
+ rcu_read_unlock();
return (ret == 0);
}
/**
- * has_capability - Does a task have a capability in a specific user ns
+ * has_capability - Does a task have a capability in init_user_ns
* @t: The task in question
- * @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
- * currently in effect to the specified user namespace, false if not.
+ * currently in effect to the initial user namespace, false if not.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
-bool has_ns_capability(struct task_struct *t,
- struct user_namespace *ns, int cap)
+bool has_capability(struct task_struct *t, int cap)
{
- int ret = security_real_capable(t, ns, cap);
-
- return (ret == 0);
+ return has_ns_capability(t, &init_user_ns, cap);
}
/**
- * has_capability_noaudit - Does a task have a capability (unaudited)
+ * has_ns_capability_noaudit - Does a task have a capability (unaudited)
+ * in a specific user ns.
* @t: The task in question
+ * @ns: target user namespace
* @cap: The capability to be tested for
*
* Return true if the specified task has the given superior capability
- * currently in effect to init_user_ns, false if not. Don't write an
- * audit message for the check.
+ * currently in effect to the specified user namespace, false if not.
+ * Do not write an audit message for the check.
*
* Note that this does not set PF_SUPERPRIV on the task.
*/
-bool has_capability_noaudit(struct task_struct *t, int cap)
+bool has_ns_capability_noaudit(struct task_struct *t,
+ struct user_namespace *ns, int cap)
{
- int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+ int ret;
+
+ rcu_read_lock();
+ ret = security_capable_noaudit(__task_cred(t), ns, cap);
+ rcu_read_unlock();
return (ret == 0);
}
/**
- * capable - Determine if the current task has a superior capability in effect
+ * has_capability_noaudit - Does a task have a capability (unaudited) in the
+ * initial user ns
+ * @t: The task in question
* @cap: The capability to be tested for
*
- * Return true if the current task has the given superior capability currently
- * available for use, false if not.
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
*
- * This sets PF_SUPERPRIV on the task if the capability is available on the
- * assumption that it's about to be used.
+ * Note that this does not set PF_SUPERPRIV on the task.
*/
-bool capable(int cap)
+bool has_capability_noaudit(struct task_struct *t, int cap)
{
- return ns_capable(&init_user_ns, cap);
+ return has_ns_capability_noaudit(t, &init_user_ns, cap);
}
-EXPORT_SYMBOL(capable);
/**
* ns_capable - Determine if the current task has a superior capability in effect
@@ -374,7 +384,7 @@ bool ns_capable(struct user_namespace *ns, int cap)
BUG();
}
- if (security_capable(ns, current_cred(), cap) == 0) {
+ if (security_capable(current_cred(), ns, cap) == 0) {
current->flags |= PF_SUPERPRIV;
return true;
}
@@ -383,18 +393,20 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
- * task_ns_capable - Determine whether current task has a superior
- * capability targeted at a specific task's user namespace.
- * @t: The task whose user namespace is targeted.
- * @cap: The capability in question.
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
*
- * Return true if it does, false otherwise.
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
*/
-bool task_ns_capable(struct task_struct *t, int cap)
+bool capable(int cap)
{
- return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+ return ns_capable(&init_user_ns, cap);
}
-EXPORT_SYMBOL(task_ns_capable);
+EXPORT_SYMBOL(capable);
/**
* nsown_capable - Check superior capability to one's own user_ns
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d9d5648f3cd..a5d3b5325f7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,7 +63,24 @@
#include <linux/atomic.h>
+/*
+ * cgroup_mutex is the master lock. Any modification to cgroup or its
+ * hierarchy must be performed while holding it.
+ *
+ * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
+ * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
+ * release_agent_path and so on. Modifying requires both cgroup_mutex and
+ * cgroup_root_mutex. Readers can acquire either of the two. This is to
+ * break the following locking order cycle.
+ *
+ * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
+ * B. namespace_sem -> cgroup_mutex
+ *
+ * B happens only through cgroup_show_options() and using cgroup_root_mutex
+ * breaks it.
+ */
static DEFINE_MUTEX(cgroup_mutex);
+static DEFINE_MUTEX(cgroup_root_mutex);
/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
@@ -760,7 +777,7 @@ EXPORT_SYMBOL_GPL(cgroup_unlock);
* -> cgroup_mkdir.
*/
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
@@ -775,7 +792,7 @@ static struct backing_dev_info cgroup_backing_dev_info = {
static int alloc_css_id(struct cgroup_subsys *ss,
struct cgroup *parent, struct cgroup *child);
-static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
+static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -921,7 +938,7 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
*
* CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
*/
-DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
{
@@ -953,6 +970,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
int i;
BUG_ON(!mutex_is_locked(&cgroup_mutex));
+ BUG_ON(!mutex_is_locked(&cgroup_root_mutex));
removed_bits = root->actual_subsys_bits & ~final_bits;
added_bits = final_bits & ~root->actual_subsys_bits;
@@ -1038,12 +1056,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
return 0;
}
-static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
{
- struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+ struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
struct cgroup_subsys *ss;
- mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
for_each_subsys(root, ss)
seq_printf(seq, ",%s", ss->name);
if (test_bit(ROOT_NOPREFIX, &root->flags))
@@ -1054,7 +1072,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
- mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&cgroup_root_mutex);
return 0;
}
@@ -1175,10 +1193,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
/*
* If the 'all' option was specified select all the subsystems,
- * otherwise 'all, 'none' and a subsystem name options were not
- * specified, let's default to 'all'
+ * otherwise if 'none', 'name=' and a subsystem name options
+ * were not specified, let's default to 'all'
*/
- if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+ if (all_ss || (!one_ss && !opts->none && !opts->name)) {
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss == NULL)
@@ -1269,6 +1287,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
@@ -1297,6 +1316,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
out_unlock:
kfree(opts.release_agent);
kfree(opts.name);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
@@ -1481,6 +1501,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int ret = 0;
struct super_block *sb;
struct cgroupfs_root *new_root;
+ struct inode *inode;
/* First find the desired set of subsystems */
mutex_lock(&cgroup_mutex);
@@ -1514,7 +1535,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
/* We used the new root structure, so this is a new hierarchy */
struct list_head tmp_cg_links;
struct cgroup *root_cgrp = &root->top_cgroup;
- struct inode *inode;
struct cgroupfs_root *existing_root;
const struct cred *cred;
int i;
@@ -1528,18 +1548,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
- if (strlen(root->name)) {
- /* Check for name clashes with existing mounts */
- for_each_active_root(existing_root) {
- if (!strcmp(existing_root->name, root->name)) {
- ret = -EBUSY;
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
- }
- }
+ /* Check for name clashes with existing mounts */
+ ret = -EBUSY;
+ if (strlen(root->name))
+ for_each_active_root(existing_root)
+ if (!strcmp(existing_root->name, root->name))
+ goto unlock_drop;
/*
* We're accessing css_set_count without locking
@@ -1549,18 +1565,13 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
* have some link structures left over
*/
ret = allocate_cg_links(css_set_count, &tmp_cg_links);
- if (ret) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
- goto drop_new_super;
- }
+ if (ret)
+ goto unlock_drop;
ret = rebind_subsystems(root, root->subsys_bits);
if (ret == -EBUSY) {
- mutex_unlock(&cgroup_mutex);
- mutex_unlock(&inode->i_mutex);
free_cg_links(&tmp_cg_links);
- goto drop_new_super;
+ goto unlock_drop;
}
/*
* There must be no failure case after here, since rebinding
@@ -1599,6 +1610,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
cred = override_creds(&init_cred);
cgroup_populate_dir(root_cgrp);
revert_creds(cred);
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
} else {
@@ -1615,6 +1627,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
kfree(opts.name);
return dget(sb->s_root);
+ unlock_drop:
+ mutex_unlock(&cgroup_root_mutex);
+ mutex_unlock(&cgroup_mutex);
+ mutex_unlock(&inode->i_mutex);
drop_new_super:
deactivate_locked_super(sb);
drop_modules:
@@ -1639,6 +1655,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
BUG_ON(!list_empty(&cgrp->sibling));
mutex_lock(&cgroup_mutex);
+ mutex_lock(&cgroup_root_mutex);
/* Rebind all subsystems back to the default hierarchy */
ret = rebind_subsystems(root, 0);
@@ -1664,6 +1681,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
root_count--;
}
+ mutex_unlock(&cgroup_root_mutex);
mutex_unlock(&cgroup_mutex);
kill_litter_super(sb);
@@ -1740,11 +1758,90 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
EXPORT_SYMBOL_GPL(cgroup_path);
/*
+ * Control Group taskset
+ */
+struct task_and_cgroup {
+ struct task_struct *task;
+ struct cgroup *cgrp;
+};
+
+struct cgroup_taskset {
+ struct task_and_cgroup single;
+ struct flex_array *tc_array;
+ int tc_array_len;
+ int idx;
+ struct cgroup *cur_cgrp;
+};
+
+/**
+ * cgroup_taskset_first - reset taskset and return the first task
+ * @tset: taskset of interest
+ *
+ * @tset iteration is initialized and the first task is returned.
+ */
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
+{
+ if (tset->tc_array) {
+ tset->idx = 0;
+ return cgroup_taskset_next(tset);
+ } else {
+ tset->cur_cgrp = tset->single.cgrp;
+ return tset->single.task;
+ }
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_first);
+
+/**
+ * cgroup_taskset_next - iterate to the next task in taskset
+ * @tset: taskset of interest
+ *
+ * Return the next task in @tset. Iteration must have been initialized
+ * with cgroup_taskset_first().
+ */
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
+{
+ struct task_and_cgroup *tc;
+
+ if (!tset->tc_array || tset->idx >= tset->tc_array_len)
+ return NULL;
+
+ tc = flex_array_get(tset->tc_array, tset->idx++);
+ tset->cur_cgrp = tc->cgrp;
+ return tc->task;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_next);
+
+/**
+ * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
+ * @tset: taskset of interest
+ *
+ * Return the cgroup for the current (last returned) task of @tset. This
+ * function must be preceded by either cgroup_taskset_first() or
+ * cgroup_taskset_next().
+ */
+struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
+{
+ return tset->cur_cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);
+
+/**
+ * cgroup_taskset_size - return the number of tasks in taskset
+ * @tset: taskset of interest
+ */
+int cgroup_taskset_size(struct cgroup_taskset *tset)
+{
+ return tset->tc_array ? tset->tc_array_len : 1;
+}
+EXPORT_SYMBOL_GPL(cgroup_taskset_size);
+
+
+/*
* cgroup_task_migrate - move a task from one cgroup to another.
*
* 'guarantee' is set if the caller promises that a new css_set for the task
* will already exist. If not set, this function might sleep, and can fail with
- * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked.
*/
static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
struct task_struct *tsk, bool guarantee)
@@ -1753,14 +1850,12 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
struct css_set *newcg;
/*
- * get old css_set. we need to take task_lock and refcount it, because
- * an exiting task can change its css_set to init_css_set and drop its
- * old one without taking cgroup_mutex.
+ * We are synchronized through threadgroup_lock() against PF_EXITING
+ * setting such that we can't race against cgroup_exit() changing the
+ * css_set to init_css_set and dropping the old one.
*/
- task_lock(tsk);
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
oldcg = tsk->cgroups;
- get_css_set(oldcg);
- task_unlock(tsk);
/* locate or allocate a new css_set for this task. */
if (guarantee) {
@@ -1775,20 +1870,11 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
might_sleep();
/* find_css_set will give us newcg already referenced. */
newcg = find_css_set(oldcg, cgrp);
- if (!newcg) {
- put_css_set(oldcg);
+ if (!newcg)
return -ENOMEM;
- }
}
- put_css_set(oldcg);
- /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- return -ESRCH;
- }
rcu_assign_pointer(tsk->cgroups, newcg);
task_unlock(tsk);
@@ -1814,8 +1900,8 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
* @cgrp: the cgroup the task is attaching to
* @tsk: the task to be attached
*
- * Call holding cgroup_mutex. May take task_lock of
- * the task 'tsk' during call.
+ * Call with cgroup_mutex and threadgroup locked. May take task_lock of
+ * @tsk during call.
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
@@ -1823,15 +1909,23 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
struct cgroupfs_root *root = cgrp->root;
+ struct cgroup_taskset tset = { };
+
+ /* @tsk either already exited or can't exit until the end */
+ if (tsk->flags & PF_EXITING)
+ return -ESRCH;
/* Nothing to do if the task is already in that cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
return 0;
+ tset.single.task = tsk;
+ tset.single.cgrp = oldcgrp;
+
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk);
+ retval = ss->can_attach(ss, cgrp, &tset);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1843,13 +1937,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
- if (ss->can_attach_task) {
- retval = ss->can_attach_task(cgrp, tsk);
- if (retval) {
- failed_ss = ss;
- goto out;
- }
- }
}
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
@@ -1857,12 +1944,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
for_each_subsys(root, ss) {
- if (ss->pre_attach)
- ss->pre_attach(cgrp);
- if (ss->attach_task)
- ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk);
+ ss->attach(ss, cgrp, &tset);
}
synchronize_rcu();
@@ -1884,7 +1967,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk);
+ ss->cancel_attach(ss, cgrp, &tset);
}
}
return retval;
@@ -1935,23 +2018,17 @@ static bool css_set_check_fetched(struct cgroup *cgrp,
read_lock(&css_set_lock);
newcg = find_existing_css_set(cg, cgrp, template);
- if (newcg)
- get_css_set(newcg);
read_unlock(&css_set_lock);
/* doesn't exist at all? */
if (!newcg)
return false;
/* see if it's already in the list */
- list_for_each_entry(cg_entry, newcg_list, links) {
- if (cg_entry->cg == newcg) {
- put_css_set(newcg);
+ list_for_each_entry(cg_entry, newcg_list, links)
+ if (cg_entry->cg == newcg)
return true;
- }
- }
/* not found */
- put_css_set(newcg);
return false;
}
@@ -1985,21 +2062,21 @@ static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
* @cgrp: the cgroup to attach to
* @leader: the threadgroup leader task_struct of the group to be attached
*
- * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
- * take task_lock of each thread in leader's threadgroup individually in turn.
+ * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
+ * task_lock of each thread in leader's threadgroup individually in turn.
*/
-int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
int retval, i, group_size;
struct cgroup_subsys *ss, *failed_ss = NULL;
- bool cancel_failed_ss = false;
/* guaranteed to be initialized later, but the compiler needs this */
- struct cgroup *oldcgrp = NULL;
struct css_set *oldcg;
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
struct task_struct *tsk;
+ struct task_and_cgroup *tc;
struct flex_array *group;
+ struct cgroup_taskset tset = { };
/*
* we need to make sure we have css_sets for all the tasks we're
* going to move -before- we actually start moving them, so that in
@@ -2012,13 +2089,12 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
* step 0: in order to do expensive, possibly blocking operations for
* every thread, we cannot iterate the thread group list, since it needs
* rcu or tasklist locked. instead, build an array of all threads in the
- * group - threadgroup_fork_lock prevents new threads from appearing,
- * and if threads exit, this will just be an over-estimate.
+ * group - group_rwsem prevents new threads from appearing, and if
+ * threads exit, this will just be an over-estimate.
*/
group_size = get_nr_threads(leader);
/* flex_array supports very large thread-groups better than kmalloc. */
- group = flex_array_alloc(sizeof(struct task_struct *), group_size,
- GFP_KERNEL);
+ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
@@ -2040,49 +2116,53 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
retval = -EAGAIN;
goto out_free_group_list;
}
- /* take a reference on each task in the group to go in the array. */
+
tsk = leader;
i = 0;
do {
+ struct task_and_cgroup ent;
+
+ /* @tsk either already exited or can't exit until the end */
+ if (tsk->flags & PF_EXITING)
+ continue;
+
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
- get_task_struct(tsk);
/*
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
*/
- retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ ent.task = tsk;
+ ent.cgrp = task_cgroup_from_root(tsk, root);
+ /* nothing to do if this task is already in the cgroup */
+ if (ent.cgrp == cgrp)
+ continue;
+ retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
/* remember the number of threads in the array for later. */
group_size = i;
+ tset.tc_array = group;
+ tset.tc_array_len = group_size;
read_unlock(&tasklist_lock);
+ /* methods shouldn't be called if no task is actually migrating */
+ retval = 0;
+ if (!group_size)
+ goto out_free_group_list;
+
/*
* step 1: check that we can legitimately attach to the cgroup.
*/
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, leader);
+ retval = ss->can_attach(ss, cgrp, &tset);
if (retval) {
failed_ss = ss;
goto out_cancel_attach;
}
}
- /* a callback to be run on every thread in the threadgroup. */
- if (ss->can_attach_task) {
- /* run on each task in the threadgroup. */
- for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- retval = ss->can_attach_task(cgrp, tsk);
- if (retval) {
- failed_ss = ss;
- cancel_failed_ss = true;
- goto out_cancel_attach;
- }
- }
- }
}
/*
@@ -2091,72 +2171,36 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
*/
INIT_LIST_HEAD(&newcg_list);
for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- /* nothing to do if this task is already in the cgroup */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- continue;
- /* get old css_set pointer */
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- /* ignore this task if it's going away */
- task_unlock(tsk);
- continue;
- }
- oldcg = tsk->cgroups;
- get_css_set(oldcg);
- task_unlock(tsk);
- /* see if the new one for us is already in the list? */
- if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
- /* was already there, nothing to do. */
- put_css_set(oldcg);
- } else {
- /* we don't already have it. get new one. */
+ tc = flex_array_get(group, i);
+ oldcg = tc->task->cgroups;
+
+ /* if we don't already have it in the list get a new one */
+ if (!css_set_check_fetched(cgrp, tc->task, oldcg,
+ &newcg_list)) {
retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
- put_css_set(oldcg);
if (retval)
goto out_list_teardown;
}
}
/*
- * step 3: now that we're guaranteed success wrt the css_sets, proceed
- * to move all tasks to the new cgroup, calling ss->attach_task for each
- * one along the way. there are no failure cases after here, so this is
- * the commit point.
+ * step 3: now that we're guaranteed success wrt the css_sets,
+ * proceed to move all tasks to the new cgroup. There are no
+ * failure cases after here, so this is the commit point.
*/
- for_each_subsys(root, ss) {
- if (ss->pre_attach)
- ss->pre_attach(cgrp);
- }
for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- /* leave current thread as it is if it's already there */
- oldcgrp = task_cgroup_from_root(tsk, root);
- if (cgrp == oldcgrp)
- continue;
- /* if the thread is PF_EXITING, it can just get skipped. */
- retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
- if (retval == 0) {
- /* attach each task to each subsystem */
- for_each_subsys(root, ss) {
- if (ss->attach_task)
- ss->attach_task(cgrp, tsk);
- }
- } else {
- BUG_ON(retval != -ESRCH);
- }
+ tc = flex_array_get(group, i);
+ retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true);
+ BUG_ON(retval);
}
/* nothing is sensitive to fork() after this point. */
/*
- * step 4: do expensive, non-thread-specific subsystem callbacks.
- * TODO: if ever a subsystem needs to know the oldcgrp for each task
- * being moved, this call will need to be reworked to communicate that.
+ * step 4: do subsystem attach callbacks.
*/
for_each_subsys(root, ss) {
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, leader);
+ ss->attach(ss, cgrp, &tset);
}
/*
@@ -2176,20 +2220,12 @@ out_cancel_attach:
/* same deal as in cgroup_attach_task */
if (retval) {
for_each_subsys(root, ss) {
- if (ss == failed_ss) {
- if (cancel_failed_ss && ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, leader);
+ if (ss == failed_ss)
break;
- }
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, leader);
+ ss->cancel_attach(ss, cgrp, &tset);
}
}
- /* clean up the array of referenced threads in the group. */
- for (i = 0; i < group_size; i++) {
- tsk = flex_array_get_ptr(group, i);
- put_task_struct(tsk);
- }
out_free_group_list:
flex_array_free(group);
return retval;
@@ -2197,8 +2233,8 @@ out_free_group_list:
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
- * function to attach either it or all tasks in its threadgroup. Will take
- * cgroup_mutex; may take task_lock of task.
+ * function to attach either it or all tasks in its threadgroup. Will lock
+ * cgroup_mutex and threadgroup; may take task_lock of task.
*/
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
@@ -2225,13 +2261,7 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
* detect it later.
*/
tsk = tsk->group_leader;
- } else if (tsk->flags & PF_EXITING) {
- /* optimization for the single-task-only case */
- rcu_read_unlock();
- cgroup_unlock();
- return -ESRCH;
}
-
/*
* even if we're attaching all tasks in the thread group, we
* only need to check permissions on one of them.
@@ -2254,13 +2284,15 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
get_task_struct(tsk);
}
- if (threadgroup) {
- threadgroup_fork_write_lock(tsk);
+ threadgroup_lock(tsk);
+
+ if (threadgroup)
ret = cgroup_attach_proc(cgrp, tsk);
- threadgroup_fork_write_unlock(tsk);
- } else {
+ else
ret = cgroup_attach_task(cgrp, tsk);
- }
+
+ threadgroup_unlock(tsk);
+
put_task_struct(tsk);
cgroup_unlock();
return ret;
@@ -2311,7 +2343,9 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
return -EINVAL;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
+ mutex_lock(&cgroup_root_mutex);
strcpy(cgrp->root->release_agent_path, buffer);
+ mutex_unlock(&cgroup_root_mutex);
cgroup_unlock();
return 0;
}
@@ -2590,7 +2624,7 @@ static inline struct cftype *__file_cft(struct file *file)
return __d_cft(file->f_dentry);
}
-static int cgroup_create_file(struct dentry *dentry, mode_t mode,
+static int cgroup_create_file(struct dentry *dentry, umode_t mode,
struct super_block *sb)
{
struct inode *inode;
@@ -2631,7 +2665,7 @@ static int cgroup_create_file(struct dentry *dentry, mode_t mode,
* @mode: mode to set on new directory.
*/
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
- mode_t mode)
+ umode_t mode)
{
struct dentry *parent;
int error = 0;
@@ -2658,9 +2692,9 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write hander
*/
-static mode_t cgroup_file_mode(const struct cftype *cft)
+static umode_t cgroup_file_mode(const struct cftype *cft)
{
- mode_t mode = 0;
+ umode_t mode = 0;
if (cft->mode)
return cft->mode;
@@ -2683,7 +2717,7 @@ int cgroup_add_file(struct cgroup *cgrp,
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
- mode_t mode;
+ umode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -2794,6 +2828,7 @@ static void cgroup_enable_task_cg_lists(void)
}
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+ __acquires(css_set_lock)
{
/*
* The first time anyone tries to iterate across a cgroup,
@@ -2833,6 +2868,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
}
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+ __releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
@@ -3757,7 +3793,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
- mode_t mode)
+ umode_t mode)
{
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
@@ -3851,7 +3887,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return err;
}
-static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
@@ -4496,20 +4532,31 @@ static const struct file_operations proc_cgroupstats_operations = {
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
- * it was not made under the protection of RCU or cgroup_mutex, so
- * might no longer be a valid cgroup pointer. cgroup_attach_task() might
- * have already changed current->cgroups, allowing the previously
- * referenced cgroup group to be removed and freed.
+ * it was not made under the protection of RCU, cgroup_mutex or
+ * threadgroup_change_begin(), so it might no longer be a valid
+ * cgroup pointer. cgroup_attach_task() might have already changed
+ * current->cgroups, allowing the previously referenced cgroup
+ * group to be removed and freed.
+ *
+ * Outside the pointer validity we also need to process the css_set
+ * inheritance between threadgoup_change_begin() and
+ * threadgoup_change_end(), this way there is no leak in any process
+ * wide migration performed by cgroup_attach_proc() that could otherwise
+ * miss a thread because it is too early or too late in the fork stage.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
- task_lock(current);
+ /*
+ * We don't need to task_lock() current because current->cgroups
+ * can't be changed concurrently here. The parent obviously hasn't
+ * exited and called cgroup_exit(), and we are synchronized against
+ * cgroup migration through threadgroup_change_begin().
+ */
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
- task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
}
@@ -4551,10 +4598,19 @@ void cgroup_post_fork(struct task_struct *child)
{
if (use_task_css_set_links) {
write_lock(&css_set_lock);
- task_lock(child);
- if (list_empty(&child->cg_list))
+ if (list_empty(&child->cg_list)) {
+ /*
+ * It's safe to use child->cgroups without task_lock()
+ * here because we are protected through
+ * threadgroup_change_begin() against concurrent
+ * css_set change in cgroup_task_migrate(). Also
+ * the task can't exit at that point until
+ * wake_up_new_task() is called, so we are protected
+ * against cgroup_exit() setting child->cgroup to
+ * init_css_set.
+ */
list_add(&child->cg_list, &child->cgroups->tasks);
- task_unlock(child);
+ }
write_unlock(&css_set_lock);
}
}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 5e828a2ca8e..fc0646b78a6 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,19 +48,17 @@ static inline struct freezer *task_freezer(struct task_struct *task)
struct freezer, css);
}
-static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
+bool cgroup_freezing(struct task_struct *task)
{
- enum freezer_state state = task_freezer(task)->state;
- return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
-}
+ enum freezer_state state;
+ bool ret;
-int cgroup_freezing_or_frozen(struct task_struct *task)
-{
- int result;
- task_lock(task);
- result = __cgroup_freezing_or_frozen(task);
- task_unlock(task);
- return result;
+ rcu_read_lock();
+ state = task_freezer(task)->state;
+ ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN;
+ rcu_read_unlock();
+
+ return ret;
}
/*
@@ -102,9 +100,6 @@ struct cgroup_subsys freezer_subsys;
* freezer_can_attach():
* cgroup_mutex (held by caller of can_attach)
*
- * cgroup_freezing_or_frozen():
- * task->alloc_lock (to get task's cgroup)
- *
* freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
* freezer->lock
* sighand->siglock (if the cgroup is freezing)
@@ -130,7 +125,7 @@ struct cgroup_subsys freezer_subsys;
* write_lock css_set_lock (cgroup iterator start)
* task->alloc_lock
* read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
+ * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator())
* sighand->siglock
*/
static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -150,7 +145,18 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
static void freezer_destroy(struct cgroup_subsys *ss,
struct cgroup *cgroup)
{
- kfree(cgroup_freezer(cgroup));
+ struct freezer *freezer = cgroup_freezer(cgroup);
+
+ if (freezer->state != CGROUP_THAWED)
+ atomic_dec(&system_freezing_cnt);
+ kfree(freezer);
+}
+
+/* task is frozen or will freeze immediately when next it gets woken */
+static bool is_task_frozen_enough(struct task_struct *task)
+{
+ return frozen(task) ||
+ (task_is_stopped_or_traced(task) && freezing(task));
}
/*
@@ -160,13 +166,17 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task)
+ struct cgroup_taskset *tset)
{
struct freezer *freezer;
+ struct task_struct *task;
/*
* Anything frozen can't move or be moved to/from.
*/
+ cgroup_taskset_for_each(task, new_cgroup, tset)
+ if (cgroup_freezing(task))
+ return -EBUSY;
freezer = cgroup_freezer(new_cgroup);
if (freezer->state != CGROUP_THAWED)
@@ -175,17 +185,6 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
return 0;
}
-static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
-{
- rcu_read_lock();
- if (__cgroup_freezing_or_frozen(tsk)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- rcu_read_unlock();
- return 0;
-}
-
static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
{
struct freezer *freezer;
@@ -213,7 +212,7 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
/* Locking avoids race with FREEZING -> THAWED transitions. */
if (freezer->state == CGROUP_FREEZING)
- freeze_task(task, true);
+ freeze_task(task);
spin_unlock_irq(&freezer->lock);
}
@@ -231,7 +230,7 @@ static void update_if_frozen(struct cgroup *cgroup,
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
ntotal++;
- if (frozen(task))
+ if (freezing(task) && is_task_frozen_enough(task))
nfrozen++;
}
@@ -279,12 +278,11 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
struct task_struct *task;
unsigned int num_cant_freeze_now = 0;
- freezer->state = CGROUP_FREEZING;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
- if (!freeze_task(task, true))
+ if (!freeze_task(task))
continue;
- if (frozen(task))
+ if (is_task_frozen_enough(task))
continue;
if (!freezing(task) && !freezer_should_skip(task))
num_cant_freeze_now++;
@@ -300,12 +298,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
struct task_struct *task;
cgroup_iter_start(cgroup, &it);
- while ((task = cgroup_iter_next(cgroup, &it))) {
- thaw_process(task);
- }
+ while ((task = cgroup_iter_next(cgroup, &it)))
+ __thaw_task(task);
cgroup_iter_end(cgroup, &it);
-
- freezer->state = CGROUP_THAWED;
}
static int freezer_change_state(struct cgroup *cgroup,
@@ -319,20 +314,24 @@ static int freezer_change_state(struct cgroup *cgroup,
spin_lock_irq(&freezer->lock);
update_if_frozen(cgroup, freezer);
- if (goal_state == freezer->state)
- goto out;
switch (goal_state) {
case CGROUP_THAWED:
+ if (freezer->state != CGROUP_THAWED)
+ atomic_dec(&system_freezing_cnt);
+ freezer->state = CGROUP_THAWED;
unfreeze_cgroup(cgroup, freezer);
break;
case CGROUP_FROZEN:
+ if (freezer->state == CGROUP_THAWED)
+ atomic_inc(&system_freezing_cnt);
+ freezer->state = CGROUP_FREEZING;
retval = try_to_freeze_cgroup(cgroup, freezer);
break;
default:
BUG();
}
-out:
+
spin_unlock_irq(&freezer->lock);
return retval;
@@ -381,10 +380,5 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
- .can_attach_task = freezer_can_attach_task,
- .pre_attach = NULL,
- .attach_task = NULL,
- .attach = NULL,
.fork = freezer_fork,
- .exit = NULL,
};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f1360947..2060c6e5702 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
write_lock_irq(&tasklist_lock);
for_each_process(p) {
if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (!cputime_eq(p->utime, cputime_zero) ||
- !cputime_eq(p->stime, cputime_zero)))
+ (p->utime || p->stime))
printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
"(state = %ld, flags = %x)\n",
p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
cpu_maps_update_done();
return err;
}
+EXPORT_SYMBOL_GPL(cpu_up);
#ifdef CONFIG_PM_SLEEP_SMP
static cpumask_var_t frozen_cpus;
@@ -470,7 +470,7 @@ out:
cpu_maps_update_done();
}
-static int alloc_frozen_cpus(void)
+static int __init alloc_frozen_cpus(void)
{
if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
return -ENOMEM;
@@ -543,7 +543,7 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
}
-int cpu_hotplug_pm_sync_init(void)
+static int __init cpu_hotplug_pm_sync_init(void)
{
pm_notifier(cpu_hotplug_pm_callback, 0);
return 0;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fe58c46a42..a09ac2b9a66 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(struct task_struct *task)
struct cpuset, css);
}
+#ifdef CONFIG_NUMA
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return task->mempolicy;
+}
+#else
+static inline bool task_has_mempolicy(struct task_struct *task)
+{
+ return false;
+}
+#endif
+
+
/* bits in struct cpuset flags field */
typedef enum {
CS_CPU_EXCLUSIVE,
@@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
+ bool need_loop;
repeat:
/*
@@ -962,6 +975,14 @@ repeat:
return;
task_lock(tsk);
+ /*
+ * Determine if a loop is necessary if another thread is doing
+ * get_mems_allowed(). If at least one node remains unchanged and
+ * tsk does not have a mempolicy, then an empty nodemask will not be
+ * possible when mems_allowed is larger than a word.
+ */
+ need_loop = task_has_mempolicy(tsk) ||
+ !nodes_intersects(*newmems, tsk->mems_allowed);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -981,11 +1002,9 @@ repeat:
/*
* Allocation of memory is very fast, we needn't sleep when waiting
- * for the read-side. No wait is necessary, however, if at least one
- * node remains unchanged.
+ * for the read-side.
*/
- while (masks_disjoint &&
- ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+ while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
task_unlock(tsk);
if (!task_curr(tsk))
yield();
@@ -1370,79 +1389,73 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk)
-{
- struct cpuset *cs = cgroup_cs(cont);
-
- if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
- return -ENOSPC;
-
- /*
- * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
- * cannot change their cpu affinity and isolating such threads by their
- * set of allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
- * be changed.
- */
- if (tsk->flags & PF_THREAD_BOUND)
- return -EINVAL;
-
- return 0;
-}
-
-static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
-{
- return security_task_setscheduler(task);
-}
-
/*
* Protected by cgroup_lock. The nodemasks must be stored globally because
- * dynamically allocating them is not allowed in pre_attach, and they must
- * persist among pre_attach, attach_task, and attach.
+ * dynamically allocating them is not allowed in can_attach, and they must
+ * persist until attach.
*/
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_from;
static nodemask_t cpuset_attach_nodemask_to;
-/* Set-up work for before attaching each task. */
-static void cpuset_pre_attach(struct cgroup *cont)
+/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- struct cpuset *cs = cgroup_cs(cont);
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct task_struct *task;
+ int ret;
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ return -ENOSPC;
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /*
+ * Kthreads bound to specific cpus cannot be moved to a new
+ * cpuset; we cannot change their cpu affinity and
+ * isolating such threads by their set of allowed nodes is
+ * unnecessary. Thus, cpusets are not applicable for such
+ * threads. This prevents checking for success of
+ * set_cpus_allowed_ptr() on all attached tasks before
+ * cpus_allowed may be changed.
+ */
+ if (task->flags & PF_THREAD_BOUND)
+ return -EINVAL;
+ if ((ret = security_task_setscheduler(task)))
+ return ret;
+ }
+
+ /* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
-}
-
-/* Per-thread attachment work. */
-static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
-{
- int err;
- struct cpuset *cs = cgroup_cs(cont);
- /*
- * can_attach beforehand should guarantee that this doesn't fail.
- * TODO: have a better way to handle failure here
- */
- err = set_cpus_allowed_ptr(tsk, cpus_attach);
- WARN_ON_ONCE(err);
-
- cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
- cpuset_update_task_spread_flag(cs, tsk);
+ return 0;
}
-static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk)
+static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
struct mm_struct *mm;
- struct cpuset *cs = cgroup_cs(cont);
- struct cpuset *oldcs = cgroup_cs(oldcont);
+ struct task_struct *task;
+ struct task_struct *leader = cgroup_taskset_first(tset);
+ struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset);
+ struct cpuset *cs = cgroup_cs(cgrp);
+ struct cpuset *oldcs = cgroup_cs(oldcgrp);
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ /*
+ * can_attach beforehand should guarantee that this doesn't
+ * fail. TODO: have a better way to handle failure here
+ */
+ WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
+
+ cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
+ cpuset_update_task_spread_flag(cs, task);
+ }
/*
* Change mm, possibly for multiple threads in a threadgroup. This is
@@ -1450,7 +1463,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
*/
cpuset_attach_nodemask_from = oldcs->mems_allowed;
cpuset_attach_nodemask_to = cs->mems_allowed;
- mm = get_task_mm(tsk);
+ mm = get_task_mm(leader);
if (mm) {
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
@@ -1906,9 +1919,6 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
- .can_attach_task = cpuset_can_attach_task,
- .pre_attach = cpuset_pre_attach,
- .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 63786e71a3c..e2ae7349437 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1982,7 +1982,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
#ifdef CONFIG_MODULE_UNLOAD
- kdb_printf("%4d ", module_refcount(mod));
+ kdb_printf("%4ld ", module_refcount(mod));
#endif
if (mod->state == MODULE_STATE_GOING)
kdb_printf(" (Unloading)");
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa8..7d6fb40d218 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
(p->exit_state & EXIT_ZOMBIE) ? 'Z' :
(p->exit_state & EXIT_DEAD) ? 'E' :
(p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
- if (p->pid == 0) {
+ if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c3..22d901f9caf 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = -pg
endif
-obj-y := core.o ring_buffer.o
+obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 00000000000..6581a040f39
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,189 @@
+/*
+ * Performance events callchain code, extracted from core.c:
+ *
+ * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ *
+ * For licensing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include "internal.h"
+
+struct callchain_cpus_entries {
+ struct rcu_head rcu_head;
+ struct perf_callchain_entry *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+static struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+ struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+ struct callchain_cpus_entries *entries;
+ int cpu;
+
+ entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+
+ kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+ struct callchain_cpus_entries *entries;
+
+ entries = callchain_cpus_entries;
+ rcu_assign_pointer(callchain_cpus_entries, NULL);
+ call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+ int cpu;
+ int size;
+ struct callchain_cpus_entries *entries;
+
+ /*
+ * We can't use the percpu allocation API for data that can be
+ * accessed from NMI. Use a temporary manual per cpu allocation
+ * until that gets sorted out.
+ */
+ size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+ entries = kzalloc(size, GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+ for_each_possible_cpu(cpu) {
+ entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!entries->cpu_entries[cpu])
+ goto fail;
+ }
+
+ rcu_assign_pointer(callchain_cpus_entries, entries);
+
+ return 0;
+
+fail:
+ for_each_possible_cpu(cpu)
+ kfree(entries->cpu_entries[cpu]);
+ kfree(entries);
+
+ return -ENOMEM;
+}
+
+int get_callchain_buffers(void)
+{
+ int err = 0;
+ int count;
+
+ mutex_lock(&callchain_mutex);
+
+ count = atomic_inc_return(&nr_callchain_events);
+ if (WARN_ON_ONCE(count < 1)) {
+ err = -EINVAL;
+ goto exit;
+ }
+
+ if (count > 1) {
+ /* If the allocation failed, give up */
+ if (!callchain_cpus_entries)
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ err = alloc_callchain_buffers();
+exit:
+ mutex_unlock(&callchain_mutex);
+
+ return err;
+}
+
+void put_callchain_buffers(void)
+{
+ if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+ release_callchain_buffers();
+ mutex_unlock(&callchain_mutex);
+ }
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+ int cpu;
+ struct callchain_cpus_entries *entries;
+
+ *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+ if (*rctx == -1)
+ return NULL;
+
+ entries = rcu_dereference(callchain_cpus_entries);
+ if (!entries)
+ return NULL;
+
+ cpu = smp_processor_id();
+
+ return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+ put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+ int rctx;
+ struct perf_callchain_entry *entry;
+
+
+ entry = get_callchain_entry(&rctx);
+ if (rctx == -1)
+ return NULL;
+
+ if (!entry)
+ goto exit_put;
+
+ entry->nr = 0;
+
+ if (!user_mode(regs)) {
+ perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+ perf_callchain_kernel(entry, regs);
+ if (current->mm)
+ regs = task_pt_regs(current);
+ else
+ regs = NULL;
+ }
+
+ if (regs) {
+ perf_callchain_store(entry, PERF_CONTEXT_USER);
+ perf_callchain_user(entry, regs);
+ }
+
+exit_put:
+ put_callchain_entry(rctx);
+
+ return entry;
+}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0e8457da6f9..1b5c081d8b9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4,7 +4,7 @@
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
@@ -128,7 +128,7 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct jump_label_key perf_sched_events __read_mostly;
+struct jump_label_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static atomic_t nr_mmap_events __read_mostly;
@@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
+
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -812,7 +815,7 @@ static void update_event_times(struct perf_event *event)
* here.
*/
if (is_cgroup_event(event))
- run_end = perf_event_time(event);
+ run_end = perf_cgroup_event_time(event);
else if (ctx->is_active)
run_end = ctx->time;
else
@@ -1127,6 +1130,8 @@ event_sched_out(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu--;
ctx->nr_active--;
+ if (event->attr.freq && event->attr.sample_freq)
+ ctx->nr_freq--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
}
@@ -1322,6 +1327,7 @@ retry:
}
raw_spin_unlock_irq(&ctx->lock);
}
+EXPORT_SYMBOL_GPL(perf_event_disable);
static void perf_set_shadow_time(struct perf_event *event,
struct perf_event_context *ctx,
@@ -1403,6 +1409,8 @@ event_sched_in(struct perf_event *event,
if (!is_software_event(event))
cpuctx->active_oncpu++;
ctx->nr_active++;
+ if (event->attr.freq && event->attr.sample_freq)
+ ctx->nr_freq++;
if (event->attr.exclusive)
cpuctx->exclusive = 1;
@@ -1659,8 +1667,7 @@ retry:
* Note: this works for group members as well as group leaders
* since the non-leader members' sibling_lists will be empty.
*/
-static void __perf_event_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
+static void __perf_event_mark_enabled(struct perf_event *event)
{
struct perf_event *sub;
u64 tstamp = perf_event_time(event);
@@ -1698,7 +1705,7 @@ static int __perf_event_enable(void *info)
*/
perf_cgroup_set_timestamp(current, ctx);
- __perf_event_mark_enabled(event, ctx);
+ __perf_event_mark_enabled(event);
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
@@ -1779,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
retry:
if (!ctx->is_active) {
- __perf_event_mark_enabled(event, ctx);
+ __perf_event_mark_enabled(event);
goto out;
}
@@ -1806,6 +1813,7 @@ retry:
out:
raw_spin_unlock_irq(&ctx->lock);
}
+EXPORT_SYMBOL_GPL(perf_event_enable);
int perf_event_refresh(struct perf_event *event, int refresh)
{
@@ -2171,9 +2179,10 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- perf_event_sched_in(cpuctx, ctx, task);
+ if (ctx->nr_events)
+ cpuctx->task_ctx = ctx;
- cpuctx->task_ctx = ctx;
+ perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
@@ -2291,7 +2300,10 @@ do { \
return div64_u64(dividend, divisor);
}
-static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
+static DEFINE_PER_CPU(int, perf_throttled_count);
+static DEFINE_PER_CPU(u64, perf_throttled_seq);
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
{
struct hw_perf_event *hwc = &event->hw;
s64 period, sample_period;
@@ -2310,19 +2322,40 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
hwc->sample_period = sample_period;
if (local64_read(&hwc->period_left) > 8*sample_period) {
- event->pmu->stop(event, PERF_EF_UPDATE);
+ if (disable)
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
local64_set(&hwc->period_left, 0);
- event->pmu->start(event, PERF_EF_RELOAD);
+
+ if (disable)
+ event->pmu->start(event, PERF_EF_RELOAD);
}
}
-static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
+/*
+ * combine freq adjustment with unthrottling to avoid two passes over the
+ * events. At the same time, make sure, having freq events does not change
+ * the rate of unthrottling as that would introduce bias.
+ */
+static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
+ int needs_unthr)
{
struct perf_event *event;
struct hw_perf_event *hwc;
- u64 interrupts, now;
+ u64 now, period = TICK_NSEC;
s64 delta;
+ /*
+ * only need to iterate over all events iff:
+ * - context have events in frequency mode (needs freq adjust)
+ * - there are events to unthrottle on this cpu
+ */
+ if (!(ctx->nr_freq || needs_unthr))
+ return;
+
+ raw_spin_lock(&ctx->lock);
+ perf_pmu_disable(ctx->pmu);
+
list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
if (event->state != PERF_EVENT_STATE_ACTIVE)
continue;
@@ -2332,13 +2365,8 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
hwc = &event->hw;
- interrupts = hwc->interrupts;
- hwc->interrupts = 0;
-
- /*
- * unthrottle events on the tick
- */
- if (interrupts == MAX_INTERRUPTS) {
+ if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
+ hwc->interrupts = 0;
perf_log_throttle(event, 1);
event->pmu->start(event, 0);
}
@@ -2346,14 +2374,30 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
if (!event->attr.freq || !event->attr.sample_freq)
continue;
- event->pmu->read(event);
+ /*
+ * stop the event and update event->count
+ */
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
now = local64_read(&event->count);
delta = now - hwc->freq_count_stamp;
hwc->freq_count_stamp = now;
+ /*
+ * restart the event
+ * reload only if value has changed
+ * we have stopped the event so tell that
+ * to perf_adjust_period() to avoid stopping it
+ * twice.
+ */
if (delta > 0)
- perf_adjust_period(event, period, delta);
+ perf_adjust_period(event, period, delta, false);
+
+ event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
}
+
+ perf_pmu_enable(ctx->pmu);
+ raw_spin_unlock(&ctx->lock);
}
/*
@@ -2376,7 +2420,6 @@ static void rotate_ctx(struct perf_event_context *ctx)
*/
static void perf_rotate_context(struct perf_cpu_context *cpuctx)
{
- u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
struct perf_event_context *ctx = NULL;
int rotate = 0, remove = 1;
@@ -2393,15 +2436,12 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
rotate = 1;
}
- perf_ctx_lock(cpuctx, cpuctx->task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
- perf_ctx_adjust_freq(&cpuctx->ctx, interval);
- if (ctx)
- perf_ctx_adjust_freq(ctx, interval);
-
if (!rotate)
goto done;
+ perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (ctx)
ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
@@ -2412,22 +2452,33 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
perf_event_sched_in(cpuctx, ctx, current);
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
done:
if (remove)
list_del_init(&cpuctx->rotation_list);
-
- perf_pmu_enable(cpuctx->ctx.pmu);
- perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
}
void perf_event_task_tick(void)
{
struct list_head *head = &__get_cpu_var(rotation_list);
struct perf_cpu_context *cpuctx, *tmp;
+ struct perf_event_context *ctx;
+ int throttled;
WARN_ON(!irqs_disabled());
+ __this_cpu_inc(perf_throttled_seq);
+ throttled = __this_cpu_xchg(perf_throttled_count, 0);
+
list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+ ctx = &cpuctx->ctx;
+ perf_adjust_freq_unthr_context(ctx, throttled);
+
+ ctx = cpuctx->task_ctx;
+ if (ctx)
+ perf_adjust_freq_unthr_context(ctx, throttled);
+
if (cpuctx->jiffies_interval == 1 ||
!(jiffies % cpuctx->jiffies_interval))
perf_rotate_context(cpuctx);
@@ -2444,7 +2495,7 @@ static int event_enable_on_exec(struct perf_event *event,
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
- __perf_event_mark_enabled(event, ctx);
+ __perf_event_mark_enabled(event);
return 1;
}
@@ -2476,13 +2527,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
raw_spin_lock(&ctx->lock);
task_ctx_sched_out(ctx);
- list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
- ret = event_enable_on_exec(event, ctx);
- if (ret)
- enabled = 1;
- }
-
- list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+ list_for_each_entry(event, &ctx->event_list, event_entry) {
ret = event_enable_on_exec(event, ctx);
if (ret)
enabled = 1;
@@ -2570,215 +2615,6 @@ static u64 perf_event_read(struct perf_event *event)
}
/*
- * Callchain support
- */
-
-struct callchain_cpus_entries {
- struct rcu_head rcu_head;
- struct perf_callchain_entry *cpu_entries[0];
-};
-
-static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
-static atomic_t nr_callchain_events;
-static DEFINE_MUTEX(callchain_mutex);
-struct callchain_cpus_entries *callchain_cpus_entries;
-
-
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
- struct pt_regs *regs)
-{
-}
-
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
- struct pt_regs *regs)
-{
-}
-
-static void release_callchain_buffers_rcu(struct rcu_head *head)
-{
- struct callchain_cpus_entries *entries;
- int cpu;
-
- entries = container_of(head, struct callchain_cpus_entries, rcu_head);
-
- for_each_possible_cpu(cpu)
- kfree(entries->cpu_entries[cpu]);
-
- kfree(entries);
-}
-
-static void release_callchain_buffers(void)
-{
- struct callchain_cpus_entries *entries;
-
- entries = callchain_cpus_entries;
- rcu_assign_pointer(callchain_cpus_entries, NULL);
- call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
-}
-
-static int alloc_callchain_buffers(void)
-{
- int cpu;
- int size;
- struct callchain_cpus_entries *entries;
-
- /*
- * We can't use the percpu allocation API for data that can be
- * accessed from NMI. Use a temporary manual per cpu allocation
- * until that gets sorted out.
- */
- size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
-
- entries = kzalloc(size, GFP_KERNEL);
- if (!entries)
- return -ENOMEM;
-
- size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
-
- for_each_possible_cpu(cpu) {
- entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
- cpu_to_node(cpu));
- if (!entries->cpu_entries[cpu])
- goto fail;
- }
-
- rcu_assign_pointer(callchain_cpus_entries, entries);
-
- return 0;
-
-fail:
- for_each_possible_cpu(cpu)
- kfree(entries->cpu_entries[cpu]);
- kfree(entries);
-
- return -ENOMEM;
-}
-
-static int get_callchain_buffers(void)
-{
- int err = 0;
- int count;
-
- mutex_lock(&callchain_mutex);
-
- count = atomic_inc_return(&nr_callchain_events);
- if (WARN_ON_ONCE(count < 1)) {
- err = -EINVAL;
- goto exit;
- }
-
- if (count > 1) {
- /* If the allocation failed, give up */
- if (!callchain_cpus_entries)
- err = -ENOMEM;
- goto exit;
- }
-
- err = alloc_callchain_buffers();
- if (err)
- release_callchain_buffers();
-exit:
- mutex_unlock(&callchain_mutex);
-
- return err;
-}
-
-static void put_callchain_buffers(void)
-{
- if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
- release_callchain_buffers();
- mutex_unlock(&callchain_mutex);
- }
-}
-
-static int get_recursion_context(int *recursion)
-{
- int rctx;
-
- if (in_nmi())
- rctx = 3;
- else if (in_irq())
- rctx = 2;
- else if (in_softirq())
- rctx = 1;
- else
- rctx = 0;
-
- if (recursion[rctx])
- return -1;
-
- recursion[rctx]++;
- barrier();
-
- return rctx;
-}
-
-static inline void put_recursion_context(int *recursion, int rctx)
-{
- barrier();
- recursion[rctx]--;
-}
-
-static struct perf_callchain_entry *get_callchain_entry(int *rctx)
-{
- int cpu;
- struct callchain_cpus_entries *entries;
-
- *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
- if (*rctx == -1)
- return NULL;
-
- entries = rcu_dereference(callchain_cpus_entries);
- if (!entries)
- return NULL;
-
- cpu = smp_processor_id();
-
- return &entries->cpu_entries[cpu][*rctx];
-}
-
-static void
-put_callchain_entry(int rctx)
-{
- put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
-}
-
-static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
- int rctx;
- struct perf_callchain_entry *entry;
-
-
- entry = get_callchain_entry(&rctx);
- if (rctx == -1)
- return NULL;
-
- if (!entry)
- goto exit_put;
-
- entry->nr = 0;
-
- if (!user_mode(regs)) {
- perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
- perf_callchain_kernel(entry, regs);
- if (current->mm)
- regs = task_pt_regs(current);
- else
- regs = NULL;
- }
-
- if (regs) {
- perf_callchain_store(entry, PERF_CONTEXT_USER);
- perf_callchain_user(entry, regs);
- }
-
-exit_put:
- put_callchain_entry(rctx);
-
- return entry;
-}
-
-/*
* Initialize the perf_event context in a task_struct:
*/
static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2942,7 +2778,7 @@ static void free_event(struct perf_event *event)
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec(&perf_sched_events);
+ jump_label_dec_deferred(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -2953,7 +2789,7 @@ static void free_event(struct perf_event *event)
put_callchain_buffers();
if (is_cgroup_event(event)) {
atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_dec(&perf_sched_events);
+ jump_label_dec_deferred(&perf_sched_events);
}
}
@@ -3190,12 +3026,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
struct ring_buffer *rb;
unsigned int events = POLL_HUP;
+ /*
+ * Race between perf_event_set_output() and perf_poll(): perf_poll()
+ * grabs the rb reference but perf_event_set_output() overrides it.
+ * Here is the timeline for two threads T1, T2:
+ * t0: T1, rb = rcu_dereference(event->rb)
+ * t1: T2, old_rb = event->rb
+ * t2: T2, event->rb = new rb
+ * t3: T2, ring_buffer_detach(old_rb)
+ * t4: T1, ring_buffer_attach(rb1)
+ * t5: T1, poll_wait(event->waitq)
+ *
+ * To avoid this problem, we grab mmap_mutex in perf_poll()
+ * thereby ensuring that the assignment of the new ring buffer
+ * and the detachment of the old buffer appear atomic to perf_poll()
+ */
+ mutex_lock(&event->mmap_mutex);
+
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (rb)
+ if (rb) {
+ ring_buffer_attach(event, rb);
events = atomic_xchg(&rb->poll, 0);
+ }
rcu_read_unlock();
+ mutex_unlock(&event->mmap_mutex);
+
poll_wait(file, &event->waitq, wait);
return events;
@@ -3496,6 +3353,53 @@ unlock:
return ret;
}
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb)
+{
+ unsigned long flags;
+
+ if (!list_empty(&event->rb_entry))
+ return;
+
+ spin_lock_irqsave(&rb->event_lock, flags);
+ if (!list_empty(&event->rb_entry))
+ goto unlock;
+
+ list_add(&event->rb_entry, &rb->event_list);
+unlock:
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+}
+
+static void ring_buffer_detach(struct perf_event *event,
+ struct ring_buffer *rb)
+{
+ unsigned long flags;
+
+ if (list_empty(&event->rb_entry))
+ return;
+
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_del_init(&event->rb_entry);
+ wake_up_all(&event->waitq);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+}
+
+static void ring_buffer_wakeup(struct perf_event *event)
+{
+ struct ring_buffer *rb;
+
+ rcu_read_lock();
+ rb = rcu_dereference(event->rb);
+ if (!rb)
+ goto unlock;
+
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+
+unlock:
+ rcu_read_unlock();
+}
+
static void rb_free_rcu(struct rcu_head *rcu_head)
{
struct ring_buffer *rb;
@@ -3521,9 +3425,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
+ struct perf_event *event, *n;
+ unsigned long flags;
+
if (!atomic_dec_and_test(&rb->refcount))
return;
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
+ list_del_init(&event->rb_entry);
+ wake_up_all(&event->waitq);
+ }
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3546,6 +3460,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
vma->vm_mm->pinned_vm -= event->mmap_locked;
rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
mutex_unlock(&event->mmap_mutex);
ring_buffer_put(rb);
@@ -3700,7 +3615,7 @@ static const struct file_operations perf_fops = {
void perf_event_wakeup(struct perf_event *event)
{
- wake_up_all(&event->waitq);
+ ring_buffer_wakeup(event);
if (event->pending_kill) {
kill_fasync(&event->fasync, SIGIO, event->pending_kill);
@@ -4624,6 +4539,7 @@ static int __perf_event_overflow(struct perf_event *event,
{
int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
+ u64 seq;
int ret = 0;
/*
@@ -4633,14 +4549,20 @@ static int __perf_event_overflow(struct perf_event *event,
if (unlikely(!is_sampling_event(event)))
return 0;
- if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
- if (throttle) {
+ seq = __this_cpu_read(perf_throttled_seq);
+ if (seq != hwc->interrupts_seq) {
+ hwc->interrupts_seq = seq;
+ hwc->interrupts = 1;
+ } else {
+ hwc->interrupts++;
+ if (unlikely(throttle
+ && hwc->interrupts >= max_samples_per_tick)) {
+ __this_cpu_inc(perf_throttled_count);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
ret = 1;
}
- } else
- hwc->interrupts++;
+ }
if (event->attr.freq) {
u64 now = perf_clock();
@@ -4649,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event,
hwc->freq_time_stamp = now;
if (delta > 0 && delta < 2*TICK_NSEC)
- perf_adjust_period(event, delta, hwc->last_period);
+ perf_adjust_period(event, delta, hwc->last_period, true);
}
/*
@@ -4737,7 +4659,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
struct hw_perf_event *hwc = &event->hw;
int throttle = 0;
- data->period = event->hw.last_period;
if (!overflow)
overflow = perf_swevent_set_period(event);
@@ -4771,6 +4692,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
if (!is_sampling_event(event))
return;
+ if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
+ data->period = nr;
+ return perf_swevent_overflow(event, 1, data, regs);
+ } else
+ data->period = event->hw.last_period;
+
if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
return perf_swevent_overflow(event, 1, data, regs);
@@ -5283,7 +5210,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
regs = get_irq_regs();
if (regs && !perf_exclude_event(event, regs)) {
- if (!(event->attr.exclude_idle && current->pid == 0))
+ if (!(event->attr.exclude_idle && is_idle_task(current)))
if (perf_event_overflow(event, &data, regs))
ret = HRTIMER_NORESTART;
}
@@ -5822,6 +5749,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
INIT_LIST_HEAD(&event->group_entry);
INIT_LIST_HEAD(&event->event_entry);
INIT_LIST_HEAD(&event->sibling_list);
+ INIT_LIST_HEAD(&event->rb_entry);
+
init_waitqueue_head(&event->waitq);
init_irq_work(&event->pending, perf_pending_event);
@@ -5896,7 +5825,7 @@ done:
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_sched_events);
+ jump_label_inc(&perf_sched_events.key);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -6028,6 +5957,8 @@ set:
old_rb = event->rb;
rcu_assign_pointer(event->rb, rb);
+ if (old_rb)
+ ring_buffer_detach(event, old_rb);
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
@@ -6132,7 +6063,7 @@ SYSCALL_DEFINE5(perf_event_open,
* - that may need work on context switch
*/
atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
- jump_label_inc(&perf_sched_events);
+ jump_label_inc(&perf_sched_events.key);
}
/*
@@ -6978,6 +6909,9 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
+
+ /* do not patch jump label more than once per second */
+ jump_label_rate_limit(&perf_sched_events, HZ);
}
static int __init perf_event_sysfs_init(void)
@@ -7044,10 +6978,13 @@ static int __perf_cgroup_move(void *info)
return 0;
}
-static void
-perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- task_function_call(task, __perf_cgroup_move, task);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset)
+ task_function_call(task, __perf_cgroup_move, task);
}
static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
@@ -7061,7 +6998,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
if (!(task->flags & PF_EXITING))
return;
- perf_cgroup_attach_task(cgrp, task);
+ task_function_call(task, __perf_cgroup_move, task);
}
struct cgroup_subsys perf_subsys = {
@@ -7070,6 +7007,6 @@ struct cgroup_subsys perf_subsys = {
.create = perf_cgroup_create,
.destroy = perf_cgroup_destroy,
.exit = perf_cgroup_exit,
- .attach_task = perf_cgroup_attach_task,
+ .attach = perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index b7971d6f38b..ee706ce44aa 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -651,10 +651,10 @@ int __init init_hw_breakpoint(void)
err_alloc:
for_each_possible_cpu(err_cpu) {
- if (err_cpu == cpu)
- break;
for (i = 0; i < TYPE_MAX; i++)
kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+ if (err_cpu == cpu)
+ break;
}
return -ENOMEM;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 09097dd8116..b0b107f90af 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
#ifndef _KERNEL_EVENTS_INTERNAL_H
#define _KERNEL_EVENTS_INTERNAL_H
+#include <linux/hardirq.h>
+
+/* Buffer handling */
+
#define RING_BUFFER_WRITABLE 0x01
struct ring_buffer {
@@ -22,6 +26,9 @@ struct ring_buffer {
local_t lost; /* nr records lost */
long watermark; /* wakeup watermark */
+ /* poll crap */
+ spinlock_t event_lock;
+ struct list_head event_list;
struct perf_event_mmap_page *user_page;
void *data_pages[0];
@@ -64,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
}
#endif
-static unsigned long perf_data_size(struct ring_buffer *rb)
+static inline unsigned long perf_data_size(struct ring_buffer *rb)
{
return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
}
@@ -93,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
} while (len);
}
+/* Callchain handling */
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+extern int get_callchain_buffers(void);
+extern void put_callchain_buffers(void);
+
+static inline int get_recursion_context(int *recursion)
+{
+ int rctx;
+
+ if (in_nmi())
+ rctx = 3;
+ else if (in_irq())
+ rctx = 2;
+ else if (in_softirq())
+ rctx = 1;
+ else
+ rctx = 0;
+
+ if (recursion[rctx])
+ return -1;
+
+ recursion[rctx]++;
+ barrier();
+
+ return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+ barrier();
+ recursion[rctx]--;
+}
+
#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index a2a29205cc0..6ddaba43fb7 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -4,7 +4,7 @@
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
* Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
- * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
*/
@@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->writable = 1;
atomic_set(&rb->refcount, 1);
+
+ INIT_LIST_HEAD(&rb->event_list);
+ spin_lock_init(&rb->event_lock);
}
#ifndef CONFIG_PERF_USE_VMALLOC
diff --git a/kernel/exit.c b/kernel/exit.c
index d0b7d988f87..4b4042f9bc6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,6 +51,7 @@
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
#include <linux/oom.h>
+#include <linux/writeback.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -121,9 +122,9 @@ static void __exit_signal(struct task_struct *tsk)
* We won't ever get here for the group leader, since it
* will have been the last reference on the signal_struct.
*/
- sig->utime = cputime_add(sig->utime, tsk->utime);
- sig->stime = cputime_add(sig->stime, tsk->stime);
- sig->gtime = cputime_add(sig->gtime, tsk->gtime);
+ sig->utime += tsk->utime;
+ sig->stime += tsk->stime;
+ sig->gtime += tsk->gtime;
sig->min_flt += tsk->min_flt;
sig->maj_flt += tsk->maj_flt;
sig->nvcsw += tsk->nvcsw;
@@ -679,8 +680,6 @@ static void exit_mm(struct task_struct * tsk)
tsk->mm = NULL;
up_read(&mm->mmap_sem);
enter_lazy_tlb(mm, current);
- /* We don't want this task to be frozen prematurely */
- clear_freeze_flag(tsk);
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
@@ -888,7 +887,7 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
-NORET_TYPE void do_exit(long code)
+void do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
@@ -965,8 +964,7 @@ NORET_TYPE void do_exit(long code)
acct_collect(code, group_dead);
if (group_dead)
tty_audit_exit();
- if (unlikely(tsk->audit_context))
- audit_free(tsk);
+ audit_free(tsk);
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
@@ -1037,9 +1035,28 @@ NORET_TYPE void do_exit(long code)
validate_creds_for_do_exit(tsk);
preempt_disable();
+ if (tsk->nr_dirtied)
+ __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
+
+ /*
+ * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
+ * when the following two conditions become true.
+ * - There is race condition of mmap_sem (It is acquired by
+ * exit_mm()), and
+ * - SMI occurs before setting TASK_RUNINNG.
+ * (or hypervisor of virtual machine switches to other guest)
+ * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
+ *
+ * To avoid it, we have to wait for releasing tsk->pi_lock which
+ * is held by try_to_wake_up()
+ */
+ smp_mb();
+ raw_spin_unlock_wait(&tsk->pi_lock);
+
/* causes final put_task_struct in finish_task_switch(). */
tsk->state = TASK_DEAD;
+ tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
schedule();
BUG();
/* Avoid "noreturn function does return". */
@@ -1049,7 +1066,7 @@ NORET_TYPE void do_exit(long code)
EXPORT_SYMBOL_GPL(do_exit);
-NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+void complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
@@ -1068,7 +1085,7 @@ SYSCALL_DEFINE1(exit, int, error_code)
* Take down every thread in the group. This is called by fatal signals
* as well as by sys_exit_group (below).
*/
-NORET_TYPE void
+void
do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
@@ -1255,19 +1272,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
- psig->cutime =
- cputime_add(psig->cutime,
- cputime_add(tgutime,
- sig->cutime));
- psig->cstime =
- cputime_add(psig->cstime,
- cputime_add(tgstime,
- sig->cstime));
- psig->cgtime =
- cputime_add(psig->cgtime,
- cputime_add(p->gtime,
- cputime_add(sig->gtime,
- sig->cgtime)));
+ psig->cutime += tgutime + sig->cutime;
+ psig->cstime += tgstime + sig->cstime;
+ psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
psig->cmin_flt +=
p->min_flt + sig->min_flt + sig->cmin_flt;
psig->cmaj_flt +=
@@ -1540,8 +1547,15 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
}
/* dead body doesn't have much to contribute */
- if (p->exit_state == EXIT_DEAD)
+ if (unlikely(p->exit_state == EXIT_DEAD)) {
+ /*
+ * But do not ignore this task until the tracer does
+ * wait_task_zombie()->do_notify_parent().
+ */
+ if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+ wo->notask_error = 0;
return 0;
+ }
/* slay zombie? */
if (p->exit_state == EXIT_ZOMBIE) {
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d08..e2cd3e2a5ae 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@
#include <linux/user-return-notifier.h>
#include <linux/oom.h>
#include <linux/khugepaged.h>
+#include <linux/signalfd.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -76,6 +77,9 @@
#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -644,6 +648,26 @@ struct mm_struct *get_task_mm(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(get_task_mm);
+struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
+{
+ struct mm_struct *mm;
+ int err;
+
+ err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+ if (err)
+ return ERR_PTR(err);
+
+ mm = get_task_mm(task);
+ if (mm && mm != current->mm &&
+ !ptrace_may_access(task, mode)) {
+ mmput(mm);
+ mm = ERR_PTR(-EACCES);
+ }
+ mutex_unlock(&task->signal->cred_guard_mutex);
+
+ return mm;
+}
+
/* Please note the differences between mmput and mm_release.
* mmput is called whenever we stop holding onto a mm_struct,
* error success whatever.
@@ -870,6 +894,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
struct io_context *ioc = current->io_context;
+ struct io_context *new_ioc;
if (!ioc)
return 0;
@@ -881,11 +906,12 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
if (unlikely(!tsk->io_context))
return -ENOMEM;
} else if (ioprio_valid(ioc->ioprio)) {
- tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
- if (unlikely(!tsk->io_context))
+ new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!new_ioc))
return -ENOMEM;
- tsk->io_context->ioprio = ioc->ioprio;
+ new_ioc->ioprio = ioc->ioprio;
+ put_io_context(new_ioc);
}
#endif
return 0;
@@ -910,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
void __cleanup_sighand(struct sighand_struct *sighand)
{
- if (atomic_dec_and_test(&sighand->count))
+ if (atomic_dec_and_test(&sighand->count)) {
+ signalfd_cleanup(sighand);
kmem_cache_free(sighand_cachep, sighand);
+ }
}
@@ -972,7 +1000,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sched_autogroup_fork(sig);
#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->threadgroup_fork_lock);
+ init_rwsem(&sig->group_rwsem);
#endif
sig->oom_adj = current->signal->oom_adj;
@@ -992,7 +1020,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
new_flags |= PF_FORKNOEXEC;
new_flags |= PF_STARTING;
p->flags = new_flags;
- clear_freeze_flag(p);
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
@@ -1023,8 +1050,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
*/
static void posix_cpu_timers_init(struct task_struct *tsk)
{
- tsk->cputime_expires.prof_exp = cputime_zero;
- tsk->cputime_expires.virt_exp = cputime_zero;
+ tsk->cputime_expires.prof_exp = 0;
+ tsk->cputime_expires.virt_exp = 0;
tsk->cputime_expires.sched_exp = 0;
INIT_LIST_HEAD(&tsk->cpu_timers[0]);
INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1159,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
init_sigpending(&p->pending);
- p->utime = cputime_zero;
- p->stime = cputime_zero;
- p->gtime = cputime_zero;
- p->utimescaled = cputime_zero;
- p->stimescaled = cputime_zero;
+ p->utime = p->stime = p->gtime = 0;
+ p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = cputime_zero;
- p->prev_stime = cputime_zero;
+ p->prev_utime = p->prev_stime = 0;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
@@ -1158,7 +1181,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_lock(current);
+ threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1296,6 +1319,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+ p->dirty_paused_when = 0;
/*
* Ok, make it visible to the rest of the system.
@@ -1373,8 +1397,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_unlock(current);
+ threadgroup_change_end(current);
perf_event_fork(p);
+
+ trace_task_newtask(p, clone_flags);
+
return p;
bad_fork_free_pid:
@@ -1408,7 +1435,7 @@ bad_fork_cleanup_policy:
bad_fork_cleanup_cgroup:
#endif
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_unlock(current);
+ threadgroup_change_end(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
@@ -1523,8 +1550,6 @@ long do_fork(unsigned long clone_flags,
init_completion(&vfork);
}
- audit_finish_fork(p);
-
/*
* We set PF_STARTING at creation in case tracing wants to
* use this to distinguish a fully live task from one that
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7be56c53439..9815b8d1eed 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -9,101 +9,114 @@
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/freezer.h>
+#include <linux/kthread.h>
-/*
- * freezing is complete, mark current process as frozen
+/* total number of freezing conditions in effect */
+atomic_t system_freezing_cnt = ATOMIC_INIT(0);
+EXPORT_SYMBOL(system_freezing_cnt);
+
+/* indicate whether PM freezing is in effect, protected by pm_mutex */
+bool pm_freezing;
+bool pm_nosig_freezing;
+
+/* protects freezing and frozen transitions */
+static DEFINE_SPINLOCK(freezer_lock);
+
+/**
+ * freezing_slow_path - slow path for testing whether a task needs to be frozen
+ * @p: task to be tested
+ *
+ * This function is called by freezing() if system_freezing_cnt isn't zero
+ * and tests whether @p needs to enter and stay in frozen state. Can be
+ * called under any context. The freezers are responsible for ensuring the
+ * target tasks see the updated state.
*/
-static inline void frozen_process(void)
+bool freezing_slow_path(struct task_struct *p)
{
- if (!unlikely(current->flags & PF_NOFREEZE)) {
- current->flags |= PF_FROZEN;
- smp_wmb();
- }
- clear_freeze_flag(current);
+ if (p->flags & PF_NOFREEZE)
+ return false;
+
+ if (pm_nosig_freezing || cgroup_freezing(p))
+ return true;
+
+ if (pm_freezing && !(p->flags & PF_KTHREAD))
+ return true;
+
+ return false;
}
+EXPORT_SYMBOL(freezing_slow_path);
/* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(void)
+bool __refrigerator(bool check_kthr_stop)
{
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
- long save;
+ bool was_frozen = false;
+ long save = current->state;
- task_lock(current);
- if (freezing(current)) {
- frozen_process();
- task_unlock(current);
- } else {
- task_unlock(current);
- return;
- }
- save = current->state;
pr_debug("%s entered refrigerator\n", current->comm);
- spin_lock_irq(&current->sighand->siglock);
- recalc_sigpending(); /* We sent fake signal, clean it up */
- spin_unlock_irq(&current->sighand->siglock);
-
- /* prevent accounting of that task to load */
- current->flags |= PF_FREEZING;
-
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (!frozen(current))
+
+ spin_lock_irq(&freezer_lock);
+ current->flags |= PF_FROZEN;
+ if (!freezing(current) ||
+ (check_kthr_stop && kthread_should_stop()))
+ current->flags &= ~PF_FROZEN;
+ spin_unlock_irq(&freezer_lock);
+
+ if (!(current->flags & PF_FROZEN))
break;
+ was_frozen = true;
schedule();
}
- /* Remove the accounting blocker */
- current->flags &= ~PF_FREEZING;
-
pr_debug("%s left refrigerator\n", current->comm);
- __set_current_state(save);
+
+ /*
+ * Restore saved task state before returning. The mb'd version
+ * needs to be used; otherwise, it might silently break
+ * synchronization which depends on ordered task state change.
+ */
+ set_current_state(save);
+
+ return was_frozen;
}
-EXPORT_SYMBOL(refrigerator);
+EXPORT_SYMBOL(__refrigerator);
static void fake_signal_wake_up(struct task_struct *p)
{
unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, 0);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
+ if (lock_task_sighand(p, &flags)) {
+ signal_wake_up(p, 0);
+ unlock_task_sighand(p, &flags);
+ }
}
/**
- * freeze_task - send a freeze request to given task
- * @p: task to send the request to
- * @sig_only: if set, the request will only be sent if the task has the
- * PF_FREEZER_NOSIG flag unset
- * Return value: 'false', if @sig_only is set and the task has
- * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
+ * freeze_task - send a freeze request to given task
+ * @p: task to send the request to
+ *
+ * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE
+ * flag and either sending a fake signal to it or waking it up, depending
+ * on whether it has %PF_FREEZER_NOSIG set.
*
- * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
- * either sending a fake signal to it or waking it up, depending on whether
- * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
- * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
- * TIF_FREEZE flag will not be set.
+ * RETURNS:
+ * %false, if @p is not freezing or already frozen; %true, otherwise
*/
-bool freeze_task(struct task_struct *p, bool sig_only)
+bool freeze_task(struct task_struct *p)
{
- /*
- * We first check if the task is freezing and next if it has already
- * been frozen to avoid the race with frozen_process() which first marks
- * the task as frozen and next clears its TIF_FREEZE.
- */
- if (!freezing(p)) {
- smp_rmb();
- if (frozen(p))
- return false;
-
- if (!sig_only || should_send_signal(p))
- set_freeze_flag(p);
- else
- return false;
+ unsigned long flags;
+
+ spin_lock_irqsave(&freezer_lock, flags);
+ if (!freezing(p) || frozen(p)) {
+ spin_unlock_irqrestore(&freezer_lock, flags);
+ return false;
}
- if (should_send_signal(p)) {
+ if (!(p->flags & PF_KTHREAD)) {
fake_signal_wake_up(p);
/*
* fake_signal_wake_up() goes through p's scheduler
@@ -111,56 +124,48 @@ bool freeze_task(struct task_struct *p, bool sig_only)
* TASK_RUNNING transition can't race with task state
* testing in try_to_freeze_tasks().
*/
- } else if (sig_only) {
- return false;
} else {
wake_up_state(p, TASK_INTERRUPTIBLE);
}
+ spin_unlock_irqrestore(&freezer_lock, flags);
return true;
}
-void cancel_freezing(struct task_struct *p)
+void __thaw_task(struct task_struct *p)
{
unsigned long flags;
- if (freezing(p)) {
- pr_debug(" clean up: %s\n", p->comm);
- clear_freeze_flag(p);
- spin_lock_irqsave(&p->sighand->siglock, flags);
- recalc_sigpending_and_wake(p);
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
-}
-
-static int __thaw_process(struct task_struct *p)
-{
- if (frozen(p)) {
- p->flags &= ~PF_FROZEN;
- return 1;
- }
- clear_freeze_flag(p);
- return 0;
+ /*
+ * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
+ * be visible to @p as waking up implies wmb. Waking up inside
+ * freezer_lock also prevents wakeups from leaking outside
+ * refrigerator.
+ */
+ spin_lock_irqsave(&freezer_lock, flags);
+ if (frozen(p))
+ wake_up_process(p);
+ spin_unlock_irqrestore(&freezer_lock, flags);
}
-/*
- * Wake up a frozen process
+/**
+ * set_freezable - make %current freezable
*
- * task_lock() is needed to prevent the race with refrigerator() which may
- * occur if the freezing of tasks fails. Namely, without the lock, if the
- * freezing of tasks failed, thaw_tasks() might have run before a task in
- * refrigerator() could call frozen_process(), in which case the task would be
- * frozen and no one would thaw it.
+ * Mark %current freezable and enter refrigerator if necessary.
*/
-int thaw_process(struct task_struct *p)
+bool set_freezable(void)
{
- task_lock(p);
- if (__thaw_process(p) == 1) {
- task_unlock(p);
- wake_up_process(p);
- return 1;
- }
- task_unlock(p);
- return 0;
+ might_sleep();
+
+ /*
+ * Modify flags while holding freezer_lock. This ensures the
+ * freezer notices that we aren't frozen yet or the freezing
+ * condition is visible to try_to_freeze() below.
+ */
+ spin_lock_irq(&freezer_lock);
+ current->flags &= ~PF_NOFREEZE;
+ spin_unlock_irq(&freezer_lock);
+
+ return try_to_freeze();
}
-EXPORT_SYMBOL(thaw_process);
+EXPORT_SYMBOL(set_freezable);
diff --git a/kernel/futex.c b/kernel/futex.c
index ea87f4d2f45..1614be20173 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -314,17 +314,29 @@ again:
#endif
lock_page(page_head);
+
+ /*
+ * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * page; but it might be the ZERO_PAGE or in the gate area or
+ * in a special mapping (all cases which we are happy to fail);
+ * or it may have been a good file page when get_user_pages_fast
+ * found it, but truncated or holepunched or subjected to
+ * invalidate_complete_page2 before we got the page lock (also
+ * cases which we are happy to fail). And we hold a reference,
+ * so refcount care in invalidate_complete_page's remove_mapping
+ * prevents drop_caches from setting mapping to NULL beneath us.
+ *
+ * The case we do have to guard against is when memory pressure made
+ * shmem_writepage move it from filecache to swapcache beneath us:
+ * an unlikely race, but we do need to retry for page_head->mapping.
+ */
if (!page_head->mapping) {
+ int shmem_swizzled = PageSwapCache(page_head);
unlock_page(page_head);
put_page(page_head);
- /*
- * ZERO_PAGE pages don't have a mapping. Avoid a busy loop
- * trying to find one. RW mapping would have COW'd (and thus
- * have a mapping) so this page is RO and won't ever change.
- */
- if ((page_head == ZERO_PAGE(address)))
- return -EFAULT;
- goto again;
+ if (shmem_swizzled)
+ goto again;
+ return -EFAULT;
}
/*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 422e567eecf..ae34bf51682 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -885,10 +885,13 @@ static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
unsigned long newstate, int reprogram)
{
+ struct timerqueue_node *next_timer;
if (!(timer->state & HRTIMER_STATE_ENQUEUED))
goto out;
- if (&timer->node == timerqueue_getnext(&base->active)) {
+ next_timer = timerqueue_getnext(&base->active);
+ timerqueue_del(&base->active, &timer->node);
+ if (&timer->node == next_timer) {
#ifdef CONFIG_HIGH_RES_TIMERS
/* Reprogram the clock event device. if enabled */
if (reprogram && hrtimer_hres_active()) {
@@ -901,7 +904,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
}
#endif
}
- timerqueue_del(&base->active, &timer->node);
if (!timerqueue_getnext(&base->active))
base->cpu_base->active_bases &= ~(1 << base->index);
out:
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 8b1748d0172..2e48ec0c2e9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -74,11 +74,17 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
/*
* Ensure the task is not frozen.
- * Also, when a freshly created task is scheduled once, changes
- * its state to TASK_UNINTERRUPTIBLE without having ever been
- * switched out once, it musn't be checked.
+ * Also, skip vfork and any other user process that freezer should skip.
*/
- if (unlikely(t->flags & PF_FROZEN || !switch_count))
+ if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP)))
+ return;
+
+ /*
+ * When a freshly created task is scheduled once, changes its state to
+ * TASK_UNINTERRUPTIBLE without having ever been switched out once, it
+ * musn't be checked.
+ */
+ if (unlikely(!switch_count))
return;
if (switch_count != t->last_switch_count) {
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 342d8f44e40..0119b9d467a 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -53,7 +53,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- irq_startup(desc);
+ irq_startup(desc, false);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -70,7 +70,7 @@ unsigned long probe_irq_on(void)
raw_spin_lock_irq(&desc->lock);
if (!desc->action && irq_settings_can_probe(desc)) {
desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
- if (irq_startup(desc))
+ if (irq_startup(desc, false))
desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f7c543a801d..fb7db75ee0c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -157,19 +157,22 @@ static void irq_state_set_masked(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
}
-int irq_startup(struct irq_desc *desc)
+int irq_startup(struct irq_desc *desc, bool resend)
{
+ int ret = 0;
+
irq_state_clr_disabled(desc);
desc->depth = 0;
if (desc->irq_data.chip->irq_startup) {
- int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+ ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
irq_state_clr_masked(desc);
- return ret;
+ } else {
+ irq_enable(desc);
}
-
- irq_enable(desc);
- return 0;
+ if (resend)
+ check_irq_resend(desc, desc->irq_data.irq);
+ return ret;
}
void irq_shutdown(struct irq_desc *desc)
@@ -330,6 +333,24 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(handle_simple_irq);
+/*
+ * Called unconditionally from handle_level_irq() and only for oneshot
+ * interrupts from handle_fasteoi_irq()
+ */
+static void cond_unmask_irq(struct irq_desc *desc)
+{
+ /*
+ * We need to unmask in the following cases:
+ * - Standard level irq (IRQF_ONESHOT is not set)
+ * - Oneshot irq which did not wake the thread (caused by a
+ * spurious interrupt or a primary handler handling it
+ * completely).
+ */
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot)
+ unmask_irq(desc);
+}
+
/**
* handle_level_irq - Level type irq handler
* @irq: the interrupt number
@@ -362,8 +383,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
handle_irq_event(desc);
- if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
- unmask_irq(desc);
+ cond_unmask_irq(desc);
+
out_unlock:
raw_spin_unlock(&desc->lock);
}
@@ -417,6 +438,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
preflow_handler(desc);
handle_irq_event(desc);
+ if (desc->istate & IRQS_ONESHOT)
+ cond_unmask_irq(desc);
+
out_eoi:
desc->irq_data.chip->irq_eoi(&desc->irq_data);
out_unlock:
@@ -625,7 +649,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
irq_settings_set_noprobe(desc);
irq_settings_set_norequest(desc);
irq_settings_set_nothread(desc);
- irq_startup(desc);
+ irq_startup(desc, true);
}
out:
irq_put_desc_busunlock(desc, flags);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index a73dd6c7372..40378ff877e 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,7 +15,7 @@
#define istate core_internal_state__do_not_mess_with_it
-extern int noirqdebug;
+extern bool noirqdebug;
/*
* Bits used by threaded handlers:
@@ -67,7 +67,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
-extern int irq_startup(struct irq_desc *desc);
+extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c58..1f9e26526b6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -135,6 +135,9 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
return -EINVAL;
if (intsize < 1)
return -EINVAL;
+ if (d->nr_irq && ((intspec[0] < d->hwirq_base) ||
+ (intspec[0] >= d->hwirq_base + d->nr_irq)))
+ return -EINVAL;
*out_hwirq = intspec[0];
*out_type = IRQ_TYPE_NONE;
@@ -143,11 +146,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
return 0;
}
-struct irq_domain_ops irq_domain_simple_ops = {
- .dt_translate = irq_domain_simple_dt_translate,
-};
-EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
-
/**
* irq_domain_create_simple() - Set up a 'simple' translation range
*/
@@ -182,3 +180,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
}
EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
#endif /* CONFIG_OF_IRQ */
+
+struct irq_domain_ops irq_domain_simple_ops = {
+#ifdef CONFIG_OF_IRQ
+ .dt_translate = irq_domain_simple_dt_translate,
+#endif /* CONFIG_OF_IRQ */
+};
+EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 67ce837ae52..32313c08444 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -623,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
static int irq_wait_for_interrupt(struct irqaction *action)
{
+ set_current_state(TASK_INTERRUPTIBLE);
+
while (!kthread_should_stop()) {
- set_current_state(TASK_INTERRUPTIBLE);
if (test_and_clear_bit(IRQTF_RUNTHREAD,
&action->thread_flags)) {
@@ -632,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
return 0;
}
schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
}
+ __set_current_state(TASK_RUNNING);
return -1;
}
@@ -1024,7 +1027,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
desc->istate |= IRQS_ONESHOT;
if (irq_settings_can_autoenable(desc))
- irq_startup(desc);
+ irq_startup(desc, true);
else
/* Undo nested disables: */
desc->depth = 1;
@@ -1289,7 +1292,7 @@ EXPORT_SYMBOL(free_irq);
* and to set up the interrupt handler in the right order.
*
* If you want to set up a threaded irq handler for your device
- * then you need to supply @handler and @thread_fn. @handler ist
+ * then you need to supply @handler and @thread_fn. @handler is
* still called in hard interrupt context and has to check
* whether the interrupt originates from the device. If yes it
* needs to disable the interrupt on the device and return
@@ -1596,7 +1599,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return -ENOMEM;
action->handler = handler;
- action->flags = IRQF_PERCPU;
+ action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
action->name = devname;
action->percpu_dev_id = dev_id;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b5f4742693c..611cd6003c4 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
*/
action = desc->action;
if (!action || !(action->flags & IRQF_SHARED) ||
- (action->flags & __IRQF_TIMER) || !action->next)
+ (action->flags & __IRQF_TIMER) ||
+ (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
+ !action->next)
goto out;
/* Already running on another processor */
@@ -323,7 +325,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
desc->irqs_unhandled = 0;
}
-int noirqdebug __read_mostly;
+bool noirqdebug __read_mostly;
int noirqdebug_setup(char *str)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153d..22000c3db0d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
cval = it->expires;
cinterval = it->incr;
- if (!cputime_eq(cval, cputime_zero)) {
+ if (cval) {
struct task_cputime cputime;
cputime_t t;
thread_group_cputimer(tsk, &cputime);
if (clock_id == CPUCLOCK_PROF)
- t = cputime_add(cputime.utime, cputime.stime);
+ t = cputime.utime + cputime.stime;
else
/* CPUCLOCK_VIRT */
t = cputime.utime;
- if (cputime_le(cval, t))
+ if (cval < t)
/* about to fire */
cval = cputime_one_jiffy;
else
- cval = cputime_sub(cval, t);
+ cval = cval - t;
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
cval = it->expires;
cinterval = it->incr;
- if (!cputime_eq(cval, cputime_zero) ||
- !cputime_eq(nval, cputime_zero)) {
- if (cputime_gt(nval, cputime_zero))
- nval = cputime_add(nval, cputime_one_jiffy);
+ if (cval || nval) {
+ if (nval > 0)
+ nval += cputime_one_jiffy;
set_process_cpu_timer(tsk, clock_id, &nval, &cval);
}
it->expires = nval;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index bbdfe2a462a..01d3b70fc98 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -66,19 +66,53 @@ void jump_label_inc(struct jump_label_key *key)
return;
jump_label_lock();
- if (atomic_add_return(1, &key->enabled) == 1)
+ if (atomic_read(&key->enabled) == 0)
jump_label_update(key, JUMP_LABEL_ENABLE);
+ atomic_inc(&key->enabled);
jump_label_unlock();
}
+EXPORT_SYMBOL_GPL(jump_label_inc);
-void jump_label_dec(struct jump_label_key *key)
+static void __jump_label_dec(struct jump_label_key *key,
+ unsigned long rate_limit, struct delayed_work *work)
{
if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
return;
- jump_label_update(key, JUMP_LABEL_DISABLE);
+ if (rate_limit) {
+ atomic_inc(&key->enabled);
+ schedule_delayed_work(work, rate_limit);
+ } else
+ jump_label_update(key, JUMP_LABEL_DISABLE);
+
jump_label_unlock();
}
+EXPORT_SYMBOL_GPL(jump_label_dec);
+
+static void jump_label_update_timeout(struct work_struct *work)
+{
+ struct jump_label_key_deferred *key =
+ container_of(work, struct jump_label_key_deferred, work.work);
+ __jump_label_dec(&key->key, 0, NULL);
+}
+
+void jump_label_dec(struct jump_label_key *key)
+{
+ __jump_label_dec(key, 0, NULL);
+}
+
+void jump_label_dec_deferred(struct jump_label_key_deferred *key)
+{
+ __jump_label_dec(&key->key, key->timeout, &key->work);
+}
+
+
+void jump_label_rate_limit(struct jump_label_key_deferred *key,
+ unsigned long rl)
+{
+ key->timeout = rl;
+ INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
+}
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
{
@@ -110,7 +144,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
* running code can override this to make the non-live update case
* cheaper.
*/
-void __weak arch_jump_label_transform_static(struct jump_entry *entry,
+void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
arch_jump_label_transform(entry, type);
@@ -216,8 +250,13 @@ void jump_label_apply_nops(struct module *mod)
if (iter_start == iter_stop)
return;
- for (iter = iter_start; iter < iter_stop; iter++)
- arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ struct jump_label_key *iterk;
+
+ iterk = (struct jump_label_key *)(unsigned long)iter->key;
+ arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
+ JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
+ }
}
static int jump_label_add_module(struct module *mod)
@@ -257,8 +296,7 @@ static int jump_label_add_module(struct module *mod)
key->next = jlm;
if (jump_label_enabled(key))
- __jump_label_update(key, iter, iter_stop,
- JUMP_LABEL_ENABLE);
+ __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
}
return 0;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index dc7bc082928..7b088678670 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -32,7 +32,6 @@
#include <linux/console.h>
#include <linux/vmalloc.h>
#include <linux/swap.h>
-#include <linux/kmsg_dump.h>
#include <linux/syscore_ops.h>
#include <asm/page.h>
@@ -1094,8 +1093,6 @@ void crash_kexec(struct pt_regs *regs)
if (kexec_crash_image) {
struct pt_regs fixed_regs;
- kmsg_dump(KMSG_DUMP_KEXEC);
-
crash_setup_regs(&fixed_regs, regs);
crash_save_vmcoreinfo();
machine_crash_shutdown(&fixed_regs);
@@ -1132,6 +1129,8 @@ int crash_shrink_memory(unsigned long new_size)
{
int ret = 0;
unsigned long start, end;
+ unsigned long old_size;
+ struct resource *ram_res;
mutex_lock(&kexec_mutex);
@@ -1141,11 +1140,15 @@ int crash_shrink_memory(unsigned long new_size)
}
start = crashk_res.start;
end = crashk_res.end;
+ old_size = (end == 0) ? 0 : end - start + 1;
+ if (new_size >= old_size) {
+ ret = (new_size == old_size) ? 0 : -EINVAL;
+ goto unlock;
+ }
- if (new_size >= end - start + 1) {
- ret = -EINVAL;
- if (new_size == end - start + 1)
- ret = 0;
+ ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
+ if (!ram_res) {
+ ret = -ENOMEM;
goto unlock;
}
@@ -1157,7 +1160,15 @@ int crash_shrink_memory(unsigned long new_size)
if ((start == end) && (crashk_res.parent != NULL))
release_resource(&crashk_res);
+
+ ram_res->start = end;
+ ram_res->end = crashk_res.end;
+ ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
+ ram_res->name = "System RAM";
+
crashk_res.end = end - 1;
+
+ insert_resource(&iomem_resource, ram_res);
crash_unmap_reserved_pages();
unlock:
@@ -1523,7 +1534,7 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
pm_prepare_console();
error = freeze_processes();
if (error) {
@@ -1576,7 +1587,7 @@ int kernel_kexec(void)
thaw_processes();
Restore_console:
pm_restore_console();
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
}
#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a4bea97c75b..a0a88543934 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -36,6 +36,7 @@
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
+#include <linux/rwsem.h>
#include <asm/uaccess.h>
#include <trace/events/module.h>
@@ -50,6 +51,7 @@ static struct workqueue_struct *khelper_wq;
static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
static DEFINE_SPINLOCK(umh_sysctl_lock);
+static DECLARE_RWSEM(umhelper_sem);
#ifdef CONFIG_MODULES
@@ -275,6 +277,7 @@ static void __call_usermodehelper(struct work_struct *work)
* If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
* (used for preventing user land processes from being created after the user
* land has been frozen during a system-wide hibernation or suspend operation).
+ * Should always be manipulated under umhelper_sem acquired for write.
*/
static int usermodehelper_disabled = 1;
@@ -282,17 +285,29 @@ static int usermodehelper_disabled = 1;
static atomic_t running_helpers = ATOMIC_INIT(0);
/*
- * Wait queue head used by usermodehelper_pm_callback() to wait for all running
+ * Wait queue head used by usermodehelper_disable() to wait for all running
* helpers to finish.
*/
static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
/*
* Time to wait for running_helpers to become zero before the setting of
- * usermodehelper_disabled in usermodehelper_pm_callback() fails
+ * usermodehelper_disabled in usermodehelper_disable() fails
*/
#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
+void read_lock_usermodehelper(void)
+{
+ down_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(read_lock_usermodehelper);
+
+void read_unlock_usermodehelper(void)
+{
+ up_read(&umhelper_sem);
+}
+EXPORT_SYMBOL_GPL(read_unlock_usermodehelper);
+
/**
* usermodehelper_disable - prevent new helpers from being started
*/
@@ -300,8 +315,10 @@ int usermodehelper_disable(void)
{
long retval;
+ down_write(&umhelper_sem);
usermodehelper_disabled = 1;
- smp_mb();
+ up_write(&umhelper_sem);
+
/*
* From now on call_usermodehelper_exec() won't start any new
* helpers, so it is sufficient if running_helpers turns out to
@@ -314,7 +331,9 @@ int usermodehelper_disable(void)
if (retval)
return 0;
+ down_write(&umhelper_sem);
usermodehelper_disabled = 0;
+ up_write(&umhelper_sem);
return -EAGAIN;
}
@@ -323,7 +342,9 @@ int usermodehelper_disable(void)
*/
void usermodehelper_enable(void)
{
+ down_write(&umhelper_sem);
usermodehelper_disabled = 0;
+ up_write(&umhelper_sem);
}
/**
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5d84644823..9788c0ec6f4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1077,6 +1077,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
kretprobe_table_lock(hash, &flags);
@@ -1085,7 +1086,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
recycle_rp_inst(ri, &empty_rp);
}
kretprobe_table_unlock(hash, &flags);
- INIT_HLIST_HEAD(&empty_rp);
hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
kfree(ri);
@@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
ri->rp = rp;
ri->task = current;
- if (rp->entry_handler && rp->entry_handler(ri, regs))
+ if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+ raw_spin_lock_irqsave(&rp->lock, flags);
+ hlist_add_head(&ri->hlist, &rp->free_instances);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
return 0;
+ }
arch_prepare_kretprobe(ri, regs);
@@ -2198,7 +2202,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
const char __user *user_buf, size_t count, loff_t *ppos)
{
char buf[32];
- int buf_size;
+ size_t buf_size;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
diff --git a/kernel/kthread.c b/kernel/kthread.c
index b6d216a9263..3d3de633702 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -59,6 +59,31 @@ int kthread_should_stop(void)
EXPORT_SYMBOL(kthread_should_stop);
/**
+ * kthread_freezable_should_stop - should this freezable kthread return now?
+ * @was_frozen: optional out parameter, indicates whether %current was frozen
+ *
+ * kthread_should_stop() for freezable kthreads, which will enter
+ * refrigerator if necessary. This function is safe from kthread_stop() /
+ * freezer deadlock and freezable kthreads should use this function instead
+ * of calling try_to_freeze() directly.
+ */
+bool kthread_freezable_should_stop(bool *was_frozen)
+{
+ bool frozen = false;
+
+ might_sleep();
+
+ if (unlikely(freezing(current)))
+ frozen = __refrigerator(true);
+
+ if (was_frozen)
+ *was_frozen = frozen;
+
+ return kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
+
+/**
* kthread_data - return data value specified on kthread creation
* @task: kthread task in question
*
@@ -257,7 +282,7 @@ int kthreadd(void *unused)
set_cpus_allowed_ptr(tsk, cpu_all_mask);
set_mems_allowed(node_states[N_HIGH_MEMORY]);
- current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
+ current->flags |= PF_NOFREEZE;
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e69434b070d..8889f7dd7c4 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -44,6 +44,7 @@
#include <linux/stringify.h>
#include <linux/bitops.h>
#include <linux/gfp.h>
+#include <linux/kmemcheck.h>
#include <asm/sections.h>
@@ -430,6 +431,7 @@ unsigned int max_lockdep_depth;
* about it later on, in lockdep_info().
*/
static int lockdep_init_error;
+static const char *lock_init_error;
static unsigned long lockdep_init_trace_data[20];
static struct stack_trace lockdep_init_trace = {
.max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -498,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
-static int __print_lock_name(struct lock_class *class)
+static void __print_lock_name(struct lock_class *class)
{
char str[KSYM_NAME_LEN];
const char *name;
name = class->name;
- if (!name)
- name = __get_key_name(class->key, str);
-
- return printk("%s", name);
-}
-
-static void print_lock_name(struct lock_class *class)
-{
- char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
- const char *name;
-
- get_usage_chars(class, usage);
-
- name = class->name;
if (!name) {
name = __get_key_name(class->key, str);
- printk(" (%s", name);
+ printk("%s", name);
} else {
- printk(" (%s", name);
+ printk("%s", name);
if (class->name_version > 1)
printk("#%d", class->name_version);
if (class->subclass)
printk("/%d", class->subclass);
}
+}
+
+static void print_lock_name(struct lock_class *class)
+{
+ char usage[LOCK_USAGE_CHARS];
+
+ get_usage_chars(class, usage);
+
+ printk(" (");
+ __print_lock_name(class);
printk("){%s}", usage);
}
@@ -567,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
}
}
-static void print_kernel_version(void)
+static void print_kernel_ident(void)
{
- printk("%s %.*s\n", init_utsname()->release,
+ printk("%s %.*s %s\n", init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
- init_utsname()->version);
+ init_utsname()->version,
+ print_tainted());
}
static int very_verbose(struct lock_class *class)
@@ -655,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (unlikely(!lockdep_initialized)) {
lockdep_init();
lockdep_init_error = 1;
+ lock_init_error = lock->name;
save_stack_trace(&lockdep_init_trace);
}
#endif
@@ -722,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
class = look_up_lock_class(lock, subclass);
if (likely(class))
- return class;
+ goto out_set_class_cache;
/*
* Debug-check: all keys must be persistent!
@@ -807,6 +807,7 @@ out_unlock_set:
graph_unlock();
raw_local_irq_restore(flags);
+out_set_class_cache:
if (!subclass || force)
lock->class_cache[0] = class;
else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1148,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
printk("\n");
printk("======================================================\n");
printk("[ INFO: possible circular locking dependency detected ]\n");
- print_kernel_version();
+ print_kernel_ident();
printk("-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
@@ -1487,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
printk("======================================================\n");
printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
irqclass, irqclass);
- print_kernel_version();
+ print_kernel_ident();
printk("------------------------------------------------------\n");
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
@@ -1716,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
printk("\n");
printk("=============================================\n");
printk("[ INFO: possible recursive locking detected ]\n");
- print_kernel_version();
+ print_kernel_ident();
printk("---------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
@@ -2223,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
printk("\n");
printk("=================================\n");
printk("[ INFO: inconsistent lock state ]\n");
- print_kernel_version();
+ print_kernel_ident();
printk("---------------------------------\n");
printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2288,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
printk("\n");
printk("=========================================================\n");
printk("[ INFO: possible irq lock inversion dependency detected ]\n");
- print_kernel_version();
+ print_kernel_ident();
printk("---------------------------------------------------------\n");
printk("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
@@ -2948,7 +2949,12 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
void lockdep_init_map(struct lockdep_map *lock, const char *name,
struct lock_class_key *key, int subclass)
{
- memset(lock, 0, sizeof(*lock));
+ int i;
+
+ kmemcheck_mark_initialized(lock, sizeof(*lock));
+
+ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+ lock->class_cache[i] = NULL;
#ifdef CONFIG_LOCK_STAT
lock->cpu = raw_smp_processor_id();
@@ -3169,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
printk("\n");
printk("=====================================\n");
printk("[ BUG: bad unlock balance detected! ]\n");
+ print_kernel_ident();
printk("-------------------------------------\n");
printk("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
@@ -3613,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
printk("\n");
printk("=================================\n");
printk("[ BUG: bad contention detected! ]\n");
+ print_kernel_ident();
printk("---------------------------------\n");
printk("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
@@ -3968,7 +3976,8 @@ void __init lockdep_info(void)
#ifdef CONFIG_DEBUG_LOCKDEP
if (lockdep_init_error) {
- printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
+ printk("WARNING: lockdep init error! lock-%s was acquired"
+ "before lockdep_init\n", lock_init_error);
printk("Call stack leading to lockdep invocation was:\n");
print_stack_trace(&lockdep_init_trace, 0);
}
@@ -3987,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
printk("\n");
printk("=========================\n");
printk("[ BUG: held lock freed! ]\n");
+ print_kernel_ident();
printk("-------------------------\n");
printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4044,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
printk("\n");
printk("=====================================\n");
printk("[ BUG: lock held at task exit time! ]\n");
+ print_kernel_ident();
printk("-------------------------------------\n");
printk("%s/%d is exiting with locks still held!\n",
curr->comm, task_pid_nr(curr));
@@ -4141,6 +4152,7 @@ void lockdep_sys_exit(void)
printk("\n");
printk("================================================\n");
printk("[ BUG: lock held when returning to user space! ]\n");
+ print_kernel_ident();
printk("------------------------------------------------\n");
printk("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
@@ -4160,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
printk("\n");
printk("===============================\n");
printk("[ INFO: suspicious RCU usage. ]\n");
+ print_kernel_ident();
printk("-------------------------------\n");
printk("%s:%d %s!\n", file, line, s);
printk("\nother info that might help us debug this:\n\n");
printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
+
+ /*
+ * If a CPU is in the RCU-free window in idle (ie: in the section
+ * between rcu_idle_enter() and rcu_idle_exit(), then RCU
+ * considers that CPU to be in an "extended quiescent state",
+ * which means that RCU will be completely ignoring that CPU.
+ * Therefore, rcu_read_lock() and friends have absolutely no
+ * effect on a CPU running in that state. In other words, even if
+ * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
+ * delete data structures out from under it. RCU really has no
+ * choice here: we need to keep an RCU-free window in idle where
+ * the CPU may possibly enter into low power mode. This way we can
+ * notice an extended quiescent state to other CPUs that started a grace
+ * period. Otherwise we would delay any grace period as long as we run
+ * in the idle task.
+ *
+ * So complain bitterly if someone does call rcu_read_lock(),
+ * rcu_read_lock_bh() and so on from extended quiescent states.
+ */
+ if (rcu_is_cpu_idle())
+ printk("RCU used illegally from extended quiescent state!\n");
+
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
dump_stack();
diff --git a/kernel/module.c b/kernel/module.c
index 178333c48d1..2c932760fd3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,12 +62,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt , a...)
-#endif
-
#ifndef ARCH_SHF_SMALL
#define ARCH_SHF_SMALL 0
#endif
@@ -138,7 +132,6 @@ struct load_info {
unsigned long len;
Elf_Shdr *sechdrs;
char *secstrings, *strtab;
- unsigned long *strmap;
unsigned long symoffs, stroffs;
struct _ddebug *debug;
unsigned int num_debug;
@@ -410,7 +403,7 @@ const struct kernel_symbol *find_symbol(const char *name,
return fsa.sym;
}
- DEBUGP("Failed to find symbol %s\n", name);
+ pr_debug("Failed to find symbol %s\n", name);
return NULL;
}
EXPORT_SYMBOL_GPL(find_symbol);
@@ -600,11 +593,11 @@ static int already_uses(struct module *a, struct module *b)
list_for_each_entry(use, &b->source_list, source_list) {
if (use->source == a) {
- DEBUGP("%s uses %s!\n", a->name, b->name);
+ pr_debug("%s uses %s!\n", a->name, b->name);
return 1;
}
}
- DEBUGP("%s does not use %s!\n", a->name, b->name);
+ pr_debug("%s does not use %s!\n", a->name, b->name);
return 0;
}
@@ -619,7 +612,7 @@ static int add_module_usage(struct module *a, struct module *b)
{
struct module_use *use;
- DEBUGP("Allocating new usage for %s.\n", a->name);
+ pr_debug("Allocating new usage for %s.\n", a->name);
use = kmalloc(sizeof(*use), GFP_ATOMIC);
if (!use) {
printk(KERN_WARNING "%s: out of memory loading\n", a->name);
@@ -663,7 +656,7 @@ static void module_unload_free(struct module *mod)
mutex_lock(&module_mutex);
list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
struct module *i = use->target;
- DEBUGP("%s unusing %s\n", mod->name, i->name);
+ pr_debug("%s unusing %s\n", mod->name, i->name);
module_put(i);
list_del(&use->source_list);
list_del(&use->target_list);
@@ -726,9 +719,9 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
}
}
-unsigned int module_refcount(struct module *mod)
+unsigned long module_refcount(struct module *mod)
{
- unsigned int incs = 0, decs = 0;
+ unsigned long incs = 0, decs = 0;
int cpu;
for_each_possible_cpu(cpu)
@@ -761,7 +754,7 @@ static void wait_for_zero_refcount(struct module *mod)
/* Since we might sleep for some time, release the mutex first */
mutex_unlock(&module_mutex);
for (;;) {
- DEBUGP("Looking at refcount...\n");
+ pr_debug("Looking at refcount...\n");
set_current_state(TASK_UNINTERRUPTIBLE);
if (module_refcount(mod) == 0)
break;
@@ -804,7 +797,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
if (mod->state != MODULE_STATE_LIVE) {
/* FIXME: if (force), slam module count and wake up
waiter --RR */
- DEBUGP("%s already dying\n", mod->name);
+ pr_debug("%s already dying\n", mod->name);
ret = -EBUSY;
goto out;
}
@@ -854,7 +847,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
struct module_use *use;
int printed_something = 0;
- seq_printf(m, " %u ", module_refcount(mod));
+ seq_printf(m, " %lu ", module_refcount(mod));
/* Always include a trailing , so userspace can differentiate
between this and the old multi-field proc format. */
@@ -904,13 +897,11 @@ EXPORT_SYMBOL_GPL(symbol_put_addr);
static ssize_t show_refcnt(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
- return sprintf(buffer, "%u\n", module_refcount(mk->mod));
+ return sprintf(buffer, "%lu\n", module_refcount(mk->mod));
}
-static struct module_attribute refcnt = {
- .attr = { .name = "refcnt", .mode = 0444 },
- .show = show_refcnt,
-};
+static struct module_attribute modinfo_refcnt =
+ __ATTR(refcnt, 0444, show_refcnt, NULL);
void module_put(struct module *module)
{
@@ -951,6 +942,26 @@ static inline int module_unload_init(struct module *mod)
}
#endif /* CONFIG_MODULE_UNLOAD */
+static size_t module_flags_taint(struct module *mod, char *buf)
+{
+ size_t l = 0;
+
+ if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
+ buf[l++] = 'P';
+ if (mod->taints & (1 << TAINT_OOT_MODULE))
+ buf[l++] = 'O';
+ if (mod->taints & (1 << TAINT_FORCED_MODULE))
+ buf[l++] = 'F';
+ if (mod->taints & (1 << TAINT_CRAP))
+ buf[l++] = 'C';
+ /*
+ * TAINT_FORCED_RMMOD: could be added.
+ * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
+ * apply to modules.
+ */
+ return l;
+}
+
static ssize_t show_initstate(struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
@@ -970,10 +981,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
return sprintf(buffer, "%s\n", state);
}
-static struct module_attribute initstate = {
- .attr = { .name = "initstate", .mode = 0444 },
- .show = show_initstate,
-};
+static struct module_attribute modinfo_initstate =
+ __ATTR(initstate, 0444, show_initstate, NULL);
static ssize_t store_uevent(struct module_attribute *mattr,
struct module_kobject *mk,
@@ -986,18 +995,50 @@ static ssize_t store_uevent(struct module_attribute *mattr,
return count;
}
-struct module_attribute module_uevent = {
- .attr = { .name = "uevent", .mode = 0200 },
- .store = store_uevent,
-};
+struct module_attribute module_uevent =
+ __ATTR(uevent, 0200, NULL, store_uevent);
+
+static ssize_t show_coresize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->core_size);
+}
+
+static struct module_attribute modinfo_coresize =
+ __ATTR(coresize, 0444, show_coresize, NULL);
+
+static ssize_t show_initsize(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ return sprintf(buffer, "%u\n", mk->mod->init_size);
+}
+
+static struct module_attribute modinfo_initsize =
+ __ATTR(initsize, 0444, show_initsize, NULL);
+
+static ssize_t show_taint(struct module_attribute *mattr,
+ struct module_kobject *mk, char *buffer)
+{
+ size_t l;
+
+ l = module_flags_taint(mk->mod, buffer);
+ buffer[l++] = '\n';
+ return l;
+}
+
+static struct module_attribute modinfo_taint =
+ __ATTR(taint, 0444, show_taint, NULL);
static struct module_attribute *modinfo_attrs[] = {
+ &module_uevent,
&modinfo_version,
&modinfo_srcversion,
- &initstate,
- &module_uevent,
+ &modinfo_initstate,
+ &modinfo_coresize,
+ &modinfo_initsize,
+ &modinfo_taint,
#ifdef CONFIG_MODULE_UNLOAD
- &refcnt,
+ &modinfo_refcnt,
#endif
NULL,
};
@@ -1057,7 +1098,7 @@ static int check_version(Elf_Shdr *sechdrs,
if (versions[i].crc == maybe_relocated(*crc, crc_owner))
return 1;
- DEBUGP("Found checksum %lX vs module %lX\n",
+ pr_debug("Found checksum %lX vs module %lX\n",
maybe_relocated(*crc, crc_owner), versions[i].crc);
goto bad_version;
}
@@ -1834,7 +1875,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
case SHN_COMMON:
/* We compiled with -fno-common. These are not
supposed to happen. */
- DEBUGP("Common symbol: %s\n", name);
+ pr_debug("Common symbol: %s\n", name);
printk("%s: please compile with -fno-common\n",
mod->name);
ret = -ENOEXEC;
@@ -1842,7 +1883,7 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
case SHN_ABS:
/* Don't need to do anything */
- DEBUGP("Absolute symbol: 0x%08lx\n",
+ pr_debug("Absolute symbol: 0x%08lx\n",
(long)sym[i].st_value);
break;
@@ -1966,7 +2007,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
for (i = 0; i < info->hdr->e_shnum; i++)
info->sechdrs[i].sh_entsize = ~0UL;
- DEBUGP("Core section allocation order:\n");
+ pr_debug("Core section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
@@ -1978,7 +2019,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
|| strstarts(sname, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
- DEBUGP("\t%s\n", name);
+ pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
@@ -1995,7 +2036,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
}
}
- DEBUGP("Init section allocation order:\n");
+ pr_debug("Init section allocation order:\n");
for (m = 0; m < ARRAY_SIZE(masks); ++m) {
for (i = 0; i < info->hdr->e_shnum; ++i) {
Elf_Shdr *s = &info->sechdrs[i];
@@ -2008,7 +2049,7 @@ static void layout_sections(struct module *mod, struct load_info *info)
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
- DEBUGP("\t%s\n", sname);
+ pr_debug("\t%s\n", sname);
}
switch (m) {
case 0: /* executable */
@@ -2178,45 +2219,46 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs,
return true;
}
+/*
+ * We only allocate and copy the strings needed by the parts of symtab
+ * we keep. This is simple, but has the effect of making multiple
+ * copies of duplicates. We could be more sophisticated, see
+ * linux-kernel thread starting with
+ * <73defb5e4bca04a6431392cc341112b1@localhost>.
+ */
static void layout_symtab(struct module *mod, struct load_info *info)
{
Elf_Shdr *symsect = info->sechdrs + info->index.sym;
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- unsigned int i, nsrc, ndst;
+ unsigned int i, nsrc, ndst, strtab_size;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect,
info->index.sym) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", info->secstrings + symsect->sh_name);
+ pr_debug("\t%s\n", info->secstrings + symsect->sh_name);
src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
- for (ndst = i = 1; i < nsrc; ++i, ++src)
- if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
- unsigned int j = src->st_name;
- while (!__test_and_set_bit(j, info->strmap)
- && info->strtab[j])
- ++j;
- ++ndst;
+ /* Compute total space required for the core symbols' strtab. */
+ for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src)
+ if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) {
+ strtab_size += strlen(&info->strtab[src->st_name]) + 1;
+ ndst++;
}
/* Append room for core symbols at end of core part. */
info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1);
- mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+ info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym);
+ mod->core_size += strtab_size;
/* Put string table section at end of init part of module. */
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- DEBUGP("\t%s\n", info->secstrings + strsect->sh_name);
-
- /* Append room for core symbols' strings at end of core part. */
- info->stroffs = mod->core_size;
- __set_bit(0, info->strmap);
- mod->core_size += bitmap_weight(info->strmap, strsect->sh_size);
+ pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
}
static void add_kallsyms(struct module *mod, const struct load_info *info)
@@ -2237,22 +2279,19 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
mod->core_symtab = dst = mod->module_core + info->symoffs;
+ mod->core_strtab = s = mod->module_core + info->stroffs;
src = mod->symtab;
*dst = *src;
+ *s++ = 0;
for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) {
if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum))
continue;
+
dst[ndst] = *src;
- dst[ndst].st_name = bitmap_weight(info->strmap,
- dst[ndst].st_name);
- ++ndst;
+ dst[ndst++].st_name = s - mod->core_strtab;
+ s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1;
}
mod->core_num_syms = ndst;
-
- mod->core_strtab = s = mod->module_core + info->stroffs;
- for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i)
- if (test_bit(i, info->strmap))
- *++s = mod->strtab[i];
}
#else
static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -2621,7 +2660,7 @@ static int move_module(struct module *mod, struct load_info *info)
mod->module_init = ptr;
/* Transfer each section which specifies SHF_ALLOC */
- DEBUGP("final section addresses:\n");
+ pr_debug("final section addresses:\n");
for (i = 0; i < info->hdr->e_shnum; i++) {
void *dest;
Elf_Shdr *shdr = &info->sechdrs[i];
@@ -2639,8 +2678,8 @@ static int move_module(struct module *mod, struct load_info *info)
memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size);
/* Update sh_addr to point to copy in image. */
shdr->sh_addr = (unsigned long)dest;
- DEBUGP("\t0x%lx %s\n",
- shdr->sh_addr, info->secstrings + shdr->sh_name);
+ pr_debug("\t0x%lx %s\n",
+ (long)shdr->sh_addr, info->secstrings + shdr->sh_name);
}
return 0;
@@ -2742,27 +2781,18 @@ static struct module *layout_and_allocate(struct load_info *info)
this is done generically; there doesn't appear to be any
special cases for the architectures. */
layout_sections(mod, info);
-
- info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size)
- * sizeof(long), GFP_KERNEL);
- if (!info->strmap) {
- err = -ENOMEM;
- goto free_percpu;
- }
layout_symtab(mod, info);
/* Allocate and move to the final place */
err = move_module(mod, info);
if (err)
- goto free_strmap;
+ goto free_percpu;
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
return mod;
-free_strmap:
- kfree(info->strmap);
free_percpu:
percpu_modfree(mod);
out:
@@ -2772,7 +2802,6 @@ out:
/* mod is no longer valid after this! */
static void module_deallocate(struct module *mod, struct load_info *info)
{
- kfree(info->strmap);
percpu_modfree(mod);
module_free(mod, mod->module_init);
module_free(mod, mod->module_core);
@@ -2811,7 +2840,7 @@ static struct module *load_module(void __user *umod,
struct module *mod;
long err;
- DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
+ pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
umod, len, uargs);
/* Copy in the blobs from userspace, check they are vaguely sane. */
@@ -2902,8 +2931,7 @@ static struct module *load_module(void __user *umod,
if (err < 0)
goto unlink;
- /* Get rid of temporary copy and strmap. */
- kfree(info.strmap);
+ /* Get rid of temporary copy. */
free_copy(&info);
/* Done! */
@@ -3256,20 +3284,7 @@ static char *module_flags(struct module *mod, char *buf)
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
buf[bx++] = '(';
- if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
- buf[bx++] = 'P';
- else if (mod->taints & (1 << TAINT_OOT_MODULE))
- buf[bx++] = 'O';
- if (mod->taints & (1 << TAINT_FORCED_MODULE))
- buf[bx++] = 'F';
- if (mod->taints & (1 << TAINT_CRAP))
- buf[bx++] = 'C';
- /*
- * TAINT_FORCED_RMMOD: could be added.
- * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
- * apply to modules.
- */
-
+ bx += module_flags_taint(mod, buf + bx);
/* Show a - for module-is-being-unloaded */
if (mod->state == MODULE_STATE_GOING)
buf[bx++] = '-';
diff --git a/kernel/panic.c b/kernel/panic.c
index b2659360421..80aed44e345 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -49,6 +49,15 @@ static long no_blink(int state)
long (*panic_blink)(int state);
EXPORT_SYMBOL(panic_blink);
+/*
+ * Stop ourself in panic -- architecture code may override this
+ */
+void __weak panic_smp_self_stop(void)
+{
+ while (1)
+ cpu_relax();
+}
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -57,8 +66,9 @@ EXPORT_SYMBOL(panic_blink);
*
* This function never returns.
*/
-NORET_TYPE void panic(const char * fmt, ...)
+void panic(const char *fmt, ...)
{
+ static DEFINE_SPINLOCK(panic_lock);
static char buf[1024];
va_list args;
long i, i_next = 0;
@@ -68,8 +78,14 @@ NORET_TYPE void panic(const char * fmt, ...)
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
+ *
+ * Only one CPU is allowed to execute the panic code from here. For
+ * multiple parallel invocations of panic, all other CPUs either
+ * stop themself or will wait until they are stopped by the 1st CPU
+ * with smp_send_stop().
*/
- preempt_disable();
+ if (!spin_trylock(&panic_lock))
+ panic_smp_self_stop();
console_verbose();
bust_spinlocks(1);
@@ -78,7 +94,11 @@ NORET_TYPE void panic(const char * fmt, ...)
va_end(args);
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
#ifdef CONFIG_DEBUG_BUGVERBOSE
- dump_stack();
+ /*
+ * Avoid nested stack-dumping if a panic occurs during oops processing
+ */
+ if (!oops_in_progress)
+ dump_stack();
#endif
/*
@@ -237,11 +257,20 @@ void add_taint(unsigned flag)
* Can't trust the integrity of the kernel anymore.
* We don't call directly debug_locks_off() because the issue
* is not necessarily serious enough to set oops_in_progress to 1
- * Also we want to keep up lockdep for staging development and
- * post-warning case.
+ * Also we want to keep up lockdep for staging/out-of-tree
+ * development and post-warning case.
*/
- if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
- printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+ switch (flag) {
+ case TAINT_CRAP:
+ case TAINT_OOT_MODULE:
+ case TAINT_WARN:
+ case TAINT_FIRMWARE_WORKAROUND:
+ break;
+
+ default:
+ if (__debug_locks_off())
+ printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
+ }
set_bit(flag, &tainted_mask);
}
diff --git a/kernel/params.c b/kernel/params.c
index 65aae11eb93..4bc965d8a1f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -25,12 +25,6 @@
#include <linux/slab.h>
#include <linux/ctype.h>
-#if 0
-#define DEBUGP printk
-#else
-#define DEBUGP(fmt, a...)
-#endif
-
/* Protects all parameters, and incidentally kmalloced_param list. */
static DEFINE_MUTEX(param_lock);
@@ -103,9 +97,10 @@ static int parse_one(char *param,
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
/* No one handled NULL, so do it here. */
- if (!val && params[i].ops->set != param_set_bool)
+ if (!val && params[i].ops->set != param_set_bool
+ && params[i].ops->set != param_set_bint)
return -EINVAL;
- DEBUGP("They are equal! Calling %p\n",
+ pr_debug("They are equal! Calling %p\n",
params[i].ops->set);
mutex_lock(&param_lock);
err = params[i].ops->set(val, &params[i]);
@@ -115,11 +110,11 @@ static int parse_one(char *param,
}
if (handle_unknown) {
- DEBUGP("Unknown argument: calling %p\n", handle_unknown);
+ pr_debug("Unknown argument: calling %p\n", handle_unknown);
return handle_unknown(param, val);
}
- DEBUGP("Unknown argument `%s'\n", param);
+ pr_debug("Unknown argument `%s'\n", param);
return -ENOENT;
}
@@ -184,7 +179,7 @@ int parse_args(const char *name,
{
char *param, *val;
- DEBUGP("Parsing ARGS: %s\n", args);
+ pr_debug("Parsing ARGS: %s\n", args);
/* Chew leading spaces */
args = skip_spaces(args);
@@ -369,6 +364,30 @@ struct kernel_param_ops param_ops_invbool = {
};
EXPORT_SYMBOL(param_ops_invbool);
+int param_set_bint(const char *val, const struct kernel_param *kp)
+{
+ struct kernel_param boolkp;
+ bool v;
+ int ret;
+
+ /* Match bool exactly, by re-using it. */
+ boolkp = *kp;
+ boolkp.arg = &v;
+ boolkp.flags |= KPARAM_ISBOOL;
+
+ ret = param_set_bool(val, &boolkp);
+ if (ret == 0)
+ *(int *)kp->arg = v;
+ return ret;
+}
+EXPORT_SYMBOL(param_set_bint);
+
+struct kernel_param_ops param_ops_bint = {
+ .set = param_set_bint,
+ .get = param_get_int,
+};
+EXPORT_SYMBOL(param_ops_bint);
+
/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
diff --git a/kernel/pid.c b/kernel/pid.c
index fa5f72227e5..9f08dfabaf1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -137,7 +137,9 @@ static int pid_before(int base, int a, int b)
}
/*
- * We might be racing with someone else trying to set pid_ns->last_pid.
+ * We might be racing with someone else trying to set pid_ns->last_pid
+ * at the pid allocation time (there's also a sysctl for this, but racing
+ * with this one is OK, see comment in kernel/pid_namespace.c about it).
* We want the winner to have the "later" value, because if the
* "earlier" value prevails, then a pid may get reused immediately.
*
@@ -541,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- int i, pidhash_size;
+ unsigned int i, pidhash_size;
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL,
&pidhash_shift, NULL, 4096);
- pidhash_size = 1 << pidhash_shift;
+ pidhash_size = 1U << pidhash_shift;
for (i = 0; i < pidhash_size; i++)
INIT_HLIST_HEAD(&pid_hash[i]);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index e9c9adc84ca..a8968396046 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -191,9 +191,40 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
return;
}
+static int pid_ns_ctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * Writing directly to ns' last_pid field is OK, since this field
+ * is volatile in a living namespace anyway and a code writing to
+ * it should synchronize its usage with external means.
+ */
+
+ tmp.data = &current->nsproxy->pid_ns->last_pid;
+ return proc_dointvec(&tmp, write, buffer, lenp, ppos);
+}
+
+static struct ctl_table pid_ns_ctl_table[] = {
+ {
+ .procname = "ns_last_pid",
+ .maxlen = sizeof(int),
+ .mode = 0666, /* permissions are checked in the handler */
+ .proc_handler = pid_ns_ctl_handler,
+ },
+ { }
+};
+
+static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
+ register_sysctl_paths(kern_path, pid_ns_ctl_table);
return 0;
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f..125cb67daa2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
return now.sched < then.sched;
} else {
- return cputime_lt(now.cpu, then.cpu);
+ return now.cpu < then.cpu;
}
}
static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
acc->sched += val.sched;
} else {
- acc->cpu = cputime_add(acc->cpu, val.cpu);
+ acc->cpu += val.cpu;
}
}
static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
a.sched -= b.sched;
} else {
- a.cpu = cputime_sub(a.cpu, b.cpu);
+ a.cpu -= b.cpu;
}
return a;
}
/*
- * Divide and limit the result to res >= 1
- *
- * This is necessary to prevent signal delivery starvation, when the result of
- * the division would be rounded down to 0.
- */
-static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
-{
- cputime_t res = cputime_div(time, div);
-
- return max_t(cputime_t, res, 1);
-}
-
-/*
* Update expiry time from increment, and increase overrun count,
* given the current clock sample.
*/
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
} else {
cputime_t delta, incr;
- if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu))
+ if (now.cpu < timer->it.cpu.expires.cpu)
return;
incr = timer->it.cpu.incr.cpu;
- delta = cputime_sub(cputime_add(now.cpu, incr),
- timer->it.cpu.expires.cpu);
+ delta = now.cpu + incr - timer->it.cpu.expires.cpu;
/* Don't use (incr*2 < delta), incr*2 might overflow. */
- for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
- incr = cputime_add(incr, incr);
- for (; i >= 0; incr = cputime_halve(incr), i--) {
- if (cputime_lt(delta, incr))
+ for (i = 0; incr < delta - incr; i++)
+ incr += incr;
+ for (; i >= 0; incr = incr >> 1, i--) {
+ if (delta < incr)
continue;
- timer->it.cpu.expires.cpu =
- cputime_add(timer->it.cpu.expires.cpu, incr);
+ timer->it.cpu.expires.cpu += incr;
timer->it_overrun += 1 << i;
- delta = cputime_sub(delta, incr);
+ delta -= incr;
}
}
}
static inline cputime_t prof_ticks(struct task_struct *p)
{
- return cputime_add(p->utime, p->stime);
+ return p->utime + p->stime;
}
static inline cputime_t virt_ticks(struct task_struct *p)
{
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- times->utime = cputime_add(times->utime, t->utime);
- times->stime = cputime_add(times->stime, t->stime);
+ times->utime += t->utime;
+ times->stime += t->stime;
times->sum_exec_runtime += task_sched_runtime(t);
} while_each_thread(tsk, t);
out:
@@ -258,10 +243,10 @@ out:
static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
{
- if (cputime_gt(b->utime, a->utime))
+ if (b->utime > a->utime)
a->utime = b->utime;
- if (cputime_gt(b->stime, a->stime))
+ if (b->stime > a->stime)
a->stime = b->stime;
if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
return -EINVAL;
case CPUCLOCK_PROF:
thread_group_cputime(p, &cputime);
- cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+ cpu->cpu = cputime.utime + cputime.stime;
break;
case CPUCLOCK_VIRT:
thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
unsigned long long sum_exec_runtime)
{
struct cpu_timer_list *timer, *next;
- cputime_t ptime = cputime_add(utime, stime);
+ cputime_t ptime = utime + stime;
list_for_each_entry_safe(timer, next, head, entry) {
list_del_init(&timer->entry);
- if (cputime_lt(timer->expires.cpu, ptime)) {
- timer->expires.cpu = cputime_zero;
+ if (timer->expires.cpu < ptime) {
+ timer->expires.cpu = 0;
} else {
- timer->expires.cpu = cputime_sub(timer->expires.cpu,
- ptime);
+ timer->expires.cpu -= ptime;
}
}
++head;
list_for_each_entry_safe(timer, next, head, entry) {
list_del_init(&timer->entry);
- if (cputime_lt(timer->expires.cpu, utime)) {
- timer->expires.cpu = cputime_zero;
+ if (timer->expires.cpu < utime) {
+ timer->expires.cpu = 0;
} else {
- timer->expires.cpu = cputime_sub(timer->expires.cpu,
- utime);
+ timer->expires.cpu -= utime;
}
}
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
struct signal_struct *const sig = tsk->signal;
cleanup_timers(tsk->signal->cpu_timers,
- cputime_add(tsk->utime, sig->utime),
- cputime_add(tsk->stime, sig->stime),
+ tsk->utime + sig->utime, tsk->stime + sig->stime,
tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
}
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
static inline int expires_gt(cputime_t expires, cputime_t new_exp)
{
- return cputime_eq(expires, cputime_zero) ||
- cputime_gt(expires, new_exp);
+ return expires == 0 || expires > new_exp;
}
/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
default:
return -EINVAL;
case CPUCLOCK_PROF:
- cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+ cpu->cpu = cputime.utime + cputime.stime;
break;
case CPUCLOCK_VIRT:
cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
unsigned long soft;
maxfire = 20;
- tsk->cputime_expires.prof_exp = cputime_zero;
+ tsk->cputime_expires.prof_exp = 0;
while (!list_empty(timers)) {
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+ if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
tsk->cputime_expires.prof_exp = t->expires.cpu;
break;
}
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
++timers;
maxfire = 20;
- tsk->cputime_expires.virt_exp = cputime_zero;
+ tsk->cputime_expires.virt_exp = 0;
while (!list_empty(timers)) {
struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+ if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
tsk->cputime_expires.virt_exp = t->expires.cpu;
break;
}
@@ -1009,20 +990,19 @@ static u32 onecputick;
static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
cputime_t *expires, cputime_t cur_time, int signo)
{
- if (cputime_eq(it->expires, cputime_zero))
+ if (!it->expires)
return;
- if (cputime_ge(cur_time, it->expires)) {
- if (!cputime_eq(it->incr, cputime_zero)) {
- it->expires = cputime_add(it->expires, it->incr);
+ if (cur_time >= it->expires) {
+ if (it->incr) {
+ it->expires += it->incr;
it->error += it->incr_error;
if (it->error >= onecputick) {
- it->expires = cputime_sub(it->expires,
- cputime_one_jiffy);
+ it->expires -= cputime_one_jiffy;
it->error -= onecputick;
}
} else {
- it->expires = cputime_zero;
+ it->expires = 0;
}
trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
__group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
}
- if (!cputime_eq(it->expires, cputime_zero) &&
- (cputime_eq(*expires, cputime_zero) ||
- cputime_lt(it->expires, *expires))) {
+ if (it->expires && (!*expires || it->expires < *expires)) {
*expires = it->expires;
}
}
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
*/
static inline int task_cputime_zero(const struct task_cputime *cputime)
{
- if (cputime_eq(cputime->utime, cputime_zero) &&
- cputime_eq(cputime->stime, cputime_zero) &&
- cputime->sum_exec_runtime == 0)
+ if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
return 1;
return 0;
}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
*/
thread_group_cputimer(tsk, &cputime);
utime = cputime.utime;
- ptime = cputime_add(utime, cputime.stime);
+ ptime = utime + cputime.stime;
sum_sched_runtime = cputime.sum_exec_runtime;
maxfire = 20;
- prof_expires = cputime_zero;
+ prof_expires = 0;
while (!list_empty(timers)) {
struct cpu_timer_list *tl = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
+ if (!--maxfire || ptime < tl->expires.cpu) {
prof_expires = tl->expires.cpu;
break;
}
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
++timers;
maxfire = 20;
- virt_expires = cputime_zero;
+ virt_expires = 0;
while (!list_empty(timers)) {
struct cpu_timer_list *tl = list_first_entry(timers,
struct cpu_timer_list,
entry);
- if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
+ if (!--maxfire || utime < tl->expires.cpu) {
virt_expires = tl->expires.cpu;
break;
}
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
}
}
x = secs_to_cputime(soft);
- if (cputime_eq(prof_expires, cputime_zero) ||
- cputime_lt(x, prof_expires)) {
+ if (!prof_expires || x < prof_expires) {
prof_expires = x;
}
}
@@ -1249,12 +1224,9 @@ out:
static inline int task_cputime_expired(const struct task_cputime *sample,
const struct task_cputime *expires)
{
- if (!cputime_eq(expires->utime, cputime_zero) &&
- cputime_ge(sample->utime, expires->utime))
+ if (expires->utime && sample->utime >= expires->utime)
return 1;
- if (!cputime_eq(expires->stime, cputime_zero) &&
- cputime_ge(cputime_add(sample->utime, sample->stime),
- expires->stime))
+ if (expires->stime && sample->utime + sample->stime >= expires->stime)
return 1;
if (expires->sum_exec_runtime != 0 &&
sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
* it to be relative, *newval argument is relative and we update
* it to be absolute.
*/
- if (!cputime_eq(*oldval, cputime_zero)) {
- if (cputime_le(*oldval, now.cpu)) {
+ if (*oldval) {
+ if (*oldval <= now.cpu) {
/* Just about to fire. */
*oldval = cputime_one_jiffy;
} else {
- *oldval = cputime_sub(*oldval, now.cpu);
+ *oldval -= now.cpu;
}
}
- if (cputime_eq(*newval, cputime_zero))
+ if (!*newval)
return;
- *newval = cputime_add(*newval, now.cpu);
+ *newval += now.cpu;
}
/*
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 196c01268eb..6d6d2887033 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -43,8 +43,6 @@ int in_suspend __nosavedata;
enum {
HIBERNATION_INVALID,
HIBERNATION_PLATFORM,
- HIBERNATION_TEST,
- HIBERNATION_TESTPROC,
HIBERNATION_SHUTDOWN,
HIBERNATION_REBOOT,
/* keep last */
@@ -55,7 +53,7 @@ enum {
static int hibernation_mode = HIBERNATION_SHUTDOWN;
-static bool freezer_test_done;
+bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
@@ -71,14 +69,14 @@ void hibernation_set_ops(const struct platform_hibernation_ops *ops)
WARN_ON(1);
return;
}
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
hibernation_ops = ops;
if (ops)
hibernation_mode = HIBERNATION_PLATFORM;
else if (hibernation_mode == HIBERNATION_PLATFORM)
hibernation_mode = HIBERNATION_SHUTDOWN;
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
}
static bool entering_platform_hibernation;
@@ -96,15 +94,6 @@ static void hibernation_debug_sleep(void)
mdelay(5000);
}
-static int hibernation_testmode(int mode)
-{
- if (hibernation_mode == mode) {
- hibernation_debug_sleep();
- return 1;
- }
- return 0;
-}
-
static int hibernation_test(int level)
{
if (pm_test_level == level) {
@@ -114,7 +103,6 @@ static int hibernation_test(int level)
return 0;
}
#else /* !CONFIG_PM_DEBUG */
-static int hibernation_testmode(int mode) { return 0; }
static int hibernation_test(int level) { return 0; }
#endif /* !CONFIG_PM_DEBUG */
@@ -278,8 +266,7 @@ static int create_image(int platform_mode)
goto Platform_finish;
error = disable_nonboot_cpus();
- if (error || hibernation_test(TEST_CPUS)
- || hibernation_testmode(HIBERNATION_TEST))
+ if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
local_irq_disable();
@@ -333,7 +320,7 @@ static int create_image(int platform_mode)
*/
int hibernation_snapshot(int platform_mode)
{
- pm_message_t msg = PMSG_RECOVER;
+ pm_message_t msg;
int error;
error = platform_begin(platform_mode);
@@ -347,39 +334,40 @@ int hibernation_snapshot(int platform_mode)
error = freeze_kernel_threads();
if (error)
- goto Close;
+ goto Cleanup;
- if (hibernation_test(TEST_FREEZER) ||
- hibernation_testmode(HIBERNATION_TESTPROC)) {
+ if (hibernation_test(TEST_FREEZER)) {
/*
* Indicate to the caller that we are returning due to a
* successful freezer test.
*/
freezer_test_done = true;
- goto Close;
+ goto Cleanup;
}
error = dpm_prepare(PMSG_FREEZE);
- if (error)
- goto Complete_devices;
+ if (error) {
+ dpm_complete(PMSG_RECOVER);
+ goto Cleanup;
+ }
suspend_console();
pm_restrict_gfp_mask();
+
error = dpm_suspend(PMSG_FREEZE);
- if (error)
- goto Recover_platform;
- if (hibernation_test(TEST_DEVICES))
- goto Recover_platform;
+ if (error || hibernation_test(TEST_DEVICES))
+ platform_recover(platform_mode);
+ else
+ error = create_image(platform_mode);
- error = create_image(platform_mode);
/*
- * Control returns here (1) after the image has been created or the
+ * In the case that we call create_image() above, the control
+ * returns here (1) after the image has been created or the
* image creation has failed and (2) after a successful restore.
*/
- Resume_devices:
/* We may need to release the preallocated image pages here. */
if (error || !in_suspend)
swsusp_free();
@@ -391,17 +379,15 @@ int hibernation_snapshot(int platform_mode)
pm_restore_gfp_mask();
resume_console();
-
- Complete_devices:
dpm_complete(msg);
Close:
platform_end(platform_mode);
return error;
- Recover_platform:
- platform_recover(platform_mode);
- goto Resume_devices;
+ Cleanup:
+ swsusp_free();
+ goto Close;
}
/**
@@ -586,9 +572,6 @@ int hibernation_platform_enter(void)
static void power_down(void)
{
switch (hibernation_mode) {
- case HIBERNATION_TEST:
- case HIBERNATION_TESTPROC:
- break;
case HIBERNATION_REBOOT:
kernel_restart(NULL);
break;
@@ -607,17 +590,6 @@ static void power_down(void)
while(1);
}
-static int prepare_processes(void)
-{
- int error = 0;
-
- if (freeze_processes()) {
- error = -EBUSY;
- thaw_processes();
- }
- return error;
-}
-
/**
* hibernate - Carry out system hibernation, including saving the image.
*/
@@ -625,7 +597,7 @@ int hibernate(void)
{
int error;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
@@ -650,7 +622,7 @@ int hibernate(void)
sys_sync();
printk("done.\n");
- error = prepare_processes();
+ error = freeze_processes();
if (error)
goto Finish;
@@ -693,7 +665,7 @@ int hibernate(void)
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return error;
}
@@ -807,11 +779,13 @@ static int software_resume(void)
goto close_finish;
error = create_basic_memory_bitmaps();
- if (error)
+ if (error) {
+ usermodehelper_enable();
goto close_finish;
+ }
pr_debug("PM: Preparing processes for restore.\n");
- error = prepare_processes();
+ error = freeze_processes();
if (error) {
swsusp_close(FMODE_READ);
goto Done;
@@ -851,8 +825,6 @@ static const char * const hibernation_modes[] = {
[HIBERNATION_PLATFORM] = "platform",
[HIBERNATION_SHUTDOWN] = "shutdown",
[HIBERNATION_REBOOT] = "reboot",
- [HIBERNATION_TEST] = "test",
- [HIBERNATION_TESTPROC] = "testproc",
};
/*
@@ -861,17 +833,15 @@ static const char * const hibernation_modes[] = {
* Hibernation can be handled in several ways. There are a few different ways
* to put the system into the sleep state: using the platform driver (e.g. ACPI
* or other hibernation_ops), powering it off or rebooting it (for testing
- * mostly), or using one of the two available test modes.
+ * mostly).
*
* The sysfs file /sys/power/disk provides an interface for selecting the
* hibernation mode to use. Reading from this file causes the available modes
- * to be printed. There are 5 modes that can be supported:
+ * to be printed. There are 3 modes that can be supported:
*
* 'platform'
* 'shutdown'
* 'reboot'
- * 'test'
- * 'testproc'
*
* If a platform hibernation driver is in use, 'platform' will be supported
* and will be used by default. Otherwise, 'shutdown' will be used by default.
@@ -895,8 +865,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
switch (i) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
- case HIBERNATION_TEST:
- case HIBERNATION_TESTPROC:
break;
case HIBERNATION_PLATFORM:
if (hibernation_ops)
@@ -925,7 +893,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (len == strlen(hibernation_modes[i])
&& !strncmp(buf, hibernation_modes[i], len)) {
@@ -937,8 +905,6 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
switch (mode) {
case HIBERNATION_SHUTDOWN:
case HIBERNATION_REBOOT:
- case HIBERNATION_TEST:
- case HIBERNATION_TESTPROC:
hibernation_mode = mode;
break;
case HIBERNATION_PLATFORM:
@@ -953,7 +919,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
if (!error)
pr_debug("PM: Hibernation mode set to '%s'\n",
hibernation_modes[mode]);
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return error ? error : n;
}
@@ -980,9 +946,9 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
if (maj != MAJOR(res) || min != MINOR(res))
goto out;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
swsusp_resume_device = res;
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
printk(KERN_INFO "PM: Starting manual resume from disk\n");
noresume = 0;
software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 36e0f0903c3..9824b41e5a1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -3,7 +3,7 @@
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
- *
+ *
* This file is released under the GPLv2
*
*/
@@ -116,7 +116,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
@@ -126,7 +126,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
break;
}
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return error ? error : n;
}
@@ -240,7 +240,7 @@ struct kobject *power_kobj;
* 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
* 'disk' (Suspend-to-Disk).
*
- * store() accepts one of those strings, translates it into the
+ * store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -282,7 +282,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
/* First, check if we are requested to hibernate */
if (len == 4 && !strncmp(buf, "disk", len)) {
error = hibernate();
- goto Exit;
+ goto Exit;
}
#ifdef CONFIG_SUSPEND
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 23a2db1ec44..21724eee520 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -50,6 +50,8 @@ static inline char *check_image_kernel(struct swsusp_info *info)
#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
/* kernel/power/hibernate.c */
+extern bool freezer_test_done;
+
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
@@ -229,8 +231,28 @@ extern int pm_test_level;
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
{
- int error = freeze_processes();
- return error ? : freeze_kernel_threads();
+ int error;
+
+ error = freeze_processes();
+
+ /*
+ * freeze_processes() automatically thaws every task if freezing
+ * fails. So we need not do anything extra upon error.
+ */
+ if (error)
+ goto Finish;
+
+ error = freeze_kernel_threads();
+
+ /*
+ * freeze_kernel_threads() thaws only kernel threads upon freezing
+ * failure. So we have to thaw the userspace tasks ourselves.
+ */
+ if (error)
+ thaw_processes();
+
+ Finish:
+ return error;
}
static inline void suspend_thaw_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index addbbe5531b..7e426459e60 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -22,16 +22,7 @@
*/
#define TIMEOUT (20 * HZ)
-static inline int freezable(struct task_struct * p)
-{
- if ((p == current) ||
- (p->flags & PF_NOFREEZE) ||
- (p->exit_state != 0))
- return 0;
- return 1;
-}
-
-static int try_to_freeze_tasks(bool sig_only)
+static int try_to_freeze_tasks(bool user_only)
{
struct task_struct *g, *p;
unsigned long end_time;
@@ -46,17 +37,14 @@ static int try_to_freeze_tasks(bool sig_only)
end_time = jiffies + TIMEOUT;
- if (!sig_only)
+ if (!user_only)
freeze_workqueues_begin();
while (true) {
todo = 0;
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- if (frozen(p) || !freezable(p))
- continue;
-
- if (!freeze_task(p, sig_only))
+ if (p == current || !freeze_task(p))
continue;
/*
@@ -77,7 +65,7 @@ static int try_to_freeze_tasks(bool sig_only)
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
- if (!sig_only) {
+ if (!user_only) {
wq_busy = freeze_workqueues_busy();
todo += wq_busy;
}
@@ -103,11 +91,6 @@ static int try_to_freeze_tasks(bool sig_only)
elapsed_csecs = elapsed_csecs64;
if (todo) {
- /* This does not unfreeze processes that are already frozen
- * (we have slightly ugly calling convention in that respect,
- * and caller must call thaw_processes() if something fails),
- * but it cleans up leftover PF_FREEZE requests.
- */
printk("\n");
printk(KERN_ERR "Freezing of tasks %s after %d.%02d seconds "
"(%d tasks refusing to freeze, wq_busy=%d):\n",
@@ -115,15 +98,11 @@ static int try_to_freeze_tasks(bool sig_only)
elapsed_csecs / 100, elapsed_csecs % 100,
todo - wq_busy, wq_busy);
- thaw_workqueues();
-
read_lock(&tasklist_lock);
do_each_thread(g, p) {
- task_lock(p);
- if (!wakeup && freezing(p) && !freezer_should_skip(p))
+ if (!wakeup && !freezer_should_skip(p) &&
+ p != current && freezing(p) && !frozen(p))
sched_show_task(p);
- cancel_freezing(p);
- task_unlock(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
} else {
@@ -136,12 +115,18 @@ static int try_to_freeze_tasks(bool sig_only)
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
+ *
+ * On success, returns 0. On failure, -errno and system is fully thawed.
*/
int freeze_processes(void)
{
int error;
+ if (!pm_freezing)
+ atomic_inc(&system_freezing_cnt);
+
printk("Freezing user space processes ... ");
+ pm_freezing = true;
error = try_to_freeze_tasks(true);
if (!error) {
printk("done.");
@@ -150,17 +135,25 @@ int freeze_processes(void)
printk("\n");
BUG_ON(in_atomic());
+ if (error)
+ thaw_processes();
return error;
}
/**
* freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
+ *
+ * On success, returns 0. On failure, -errno and only the kernel threads are
+ * thawed, so as to give a chance to the caller to do additional cleanups
+ * (if any) before thawing the userspace tasks. So, it is the responsibility
+ * of the caller to thaw the userspace tasks, when the time is right.
*/
int freeze_kernel_threads(void)
{
int error;
printk("Freezing remaining freezable tasks ... ");
+ pm_nosig_freezing = true;
error = try_to_freeze_tasks(false);
if (!error)
printk("done.");
@@ -168,38 +161,52 @@ int freeze_kernel_threads(void)
printk("\n");
BUG_ON(in_atomic());
+ if (error)
+ thaw_kernel_threads();
return error;
}
-static void thaw_tasks(bool nosig_only)
+void thaw_processes(void)
{
struct task_struct *g, *p;
- read_lock(&tasklist_lock);
- do_each_thread(g, p) {
- if (!freezable(p))
- continue;
+ if (pm_freezing)
+ atomic_dec(&system_freezing_cnt);
+ pm_freezing = false;
+ pm_nosig_freezing = false;
- if (nosig_only && should_send_signal(p))
- continue;
+ oom_killer_enable();
+
+ printk("Restarting tasks ... ");
- if (cgroup_freezing_or_frozen(p))
- continue;
+ thaw_workqueues();
- thaw_process(p);
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ __thaw_task(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
+
+ schedule();
+ printk("done.\n");
}
-void thaw_processes(void)
+void thaw_kernel_threads(void)
{
- oom_killer_enable();
+ struct task_struct *g, *p;
+
+ pm_nosig_freezing = false;
+ printk("Restarting kernel threads ... ");
- printk("Restarting tasks ... ");
thaw_workqueues();
- thaw_tasks(true);
- thaw_tasks(false);
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
+ __thaw_task(p);
+ } while_each_thread(g, p);
+ read_unlock(&tasklist_lock);
+
schedule();
printk("done.\n");
}
-
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cbe2c144139..6a768e53700 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -812,7 +812,8 @@ unsigned int snapshot_additional_pages(struct zone *zone)
unsigned int res;
res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
- res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
+ res += DIV_ROUND_UP(res * sizeof(struct bm_block),
+ LINKED_PAGE_DATA_SIZE);
return 2 * res;
}
@@ -858,6 +859,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
PageReserved(page))
return NULL;
+ if (page_is_guard(page))
+ return NULL;
+
return page;
}
@@ -920,6 +924,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn)
&& (!kernel_page_present(page) || pfn_is_nosave(pfn)))
return NULL;
+ if (page_is_guard(page))
+ return NULL;
+
return page;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4953dc054c5..4fd51beed87 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -42,9 +42,9 @@ static const struct platform_suspend_ops *suspend_ops;
*/
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
suspend_ops = ops;
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
}
EXPORT_SYMBOL_GPL(suspend_set_ops);
@@ -106,13 +106,11 @@ static int suspend_prepare(void)
goto Finish;
error = suspend_freeze_processes();
- if (error) {
- suspend_stats.failed_freeze++;
- dpm_save_failed_step(SUSPEND_FREEZE);
- } else
+ if (!error)
return 0;
- suspend_thaw_processes();
+ suspend_stats.failed_freeze++;
+ dpm_save_failed_step(SUSPEND_FREEZE);
usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_SUSPEND);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 11a594c4ba2..8742fd013a9 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -18,7 +18,6 @@
#include <linux/bitops.h>
#include <linux/genhd.h>
#include <linux/device.h>
-#include <linux/buffer_head.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/swap.h>
@@ -774,8 +773,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
pr_debug("PM: Free swap pages: %u\n", free_swap);
- required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ?
- nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1);
+ required = PAGES_FOR_IO + nr_pages;
return free_swap > required;
}
@@ -803,10 +801,12 @@ int swsusp_write(unsigned int flags)
printk(KERN_ERR "PM: Cannot get swap writer\n");
return error;
}
- if (!enough_swap(pages, flags)) {
- printk(KERN_ERR "PM: Not enough free swap\n");
- error = -ENOSPC;
- goto out_finish;
+ if (flags & SF_NOCOMPRESS_MODE) {
+ if (!enough_swap(pages, flags)) {
+ printk(KERN_ERR "PM: Not enough free swap\n");
+ error = -ENOSPC;
+ goto out_finish;
+ }
}
memset(&snapshot, 0, sizeof(struct snapshot_handle));
error = snapshot_read_next(&snapshot);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6d8f535c2b8..3e100075b13 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -21,6 +21,7 @@
#include <linux/swapops.h>
#include <linux/pm.h>
#include <linux/fs.h>
+#include <linux/compat.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
@@ -30,28 +31,6 @@
#include "power.h"
-/*
- * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
- * will be removed in the future. They are only preserved here for
- * compatibility with existing userland utilities.
- */
-#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
-#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
-
-#define PMOPS_PREPARE 1
-#define PMOPS_ENTER 2
-#define PMOPS_FINISH 3
-
-/*
- * NOTE: The following ioctl definitions are wrong and have been replaced with
- * correct ones. They are only preserved here for compatibility with existing
- * userland utilities and will be removed in the future.
- */
-#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
-#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
-#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
-#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
-
#define SNAPSHOT_MINOR 231
@@ -71,7 +50,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
struct snapshot_data *data;
int error;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
@@ -123,7 +102,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->platform_support = 0;
Unlock:
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return error;
}
@@ -132,7 +111,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
swsusp_free();
free_basic_memory_bitmaps();
@@ -146,7 +125,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
PM_POST_HIBERNATION : PM_POST_RESTORE);
atomic_inc(&snapshot_device_available);
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return 0;
}
@@ -158,7 +137,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
ssize_t res;
loff_t pg_offp = *offp & ~PAGE_MASK;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
data = filp->private_data;
if (!data->ready) {
@@ -179,7 +158,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
*offp += res;
Unlock:
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return res;
}
@@ -191,7 +170,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
ssize_t res;
loff_t pg_offp = *offp & ~PAGE_MASK;
- mutex_lock(&pm_mutex);
+ lock_system_sleep();
data = filp->private_data;
@@ -208,20 +187,11 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
if (res > 0)
*offp += res;
unlock:
- mutex_unlock(&pm_mutex);
+ unlock_system_sleep();
return res;
}
-static void snapshot_deprecated_ioctl(unsigned int cmd)
-{
- if (printk_ratelimit())
- printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
- "be removed soon, update your suspend-to-disk "
- "utilities\n",
- __builtin_return_address(0), cmd);
-}
-
static long snapshot_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg)
{
@@ -257,11 +227,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
break;
error = freeze_processes();
- if (error) {
- thaw_processes();
+ if (error)
usermodehelper_enable();
- }
- if (!error)
+ else
data->frozen = 1;
break;
@@ -274,8 +242,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
data->frozen = 0;
break;
- case SNAPSHOT_ATOMIC_SNAPSHOT:
- snapshot_deprecated_ioctl(cmd);
case SNAPSHOT_CREATE_IMAGE:
if (data->mode != O_RDONLY || !data->frozen || data->ready) {
error = -EPERM;
@@ -283,10 +249,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
}
pm_restore_gfp_mask();
error = hibernation_snapshot(data->platform_support);
- if (!error)
+ if (error) {
+ thaw_kernel_threads();
+ } else {
error = put_user(in_suspend, (int __user *)arg);
- if (!error)
- data->ready = 1;
+ if (!error && !freezer_test_done)
+ data->ready = 1;
+ if (freezer_test_done) {
+ freezer_test_done = false;
+ thaw_kernel_threads();
+ }
+ }
break;
case SNAPSHOT_ATOMIC_RESTORE:
@@ -303,10 +276,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
swsusp_free();
memset(&data->handle, 0, sizeof(struct snapshot_handle));
data->ready = 0;
+ /*
+ * It is necessary to thaw kernel threads here, because
+ * SNAPSHOT_CREATE_IMAGE may be invoked directly after
+ * SNAPSHOT_FREE. In that case, if kernel threads were not
+ * thawed, the preallocation of memory carried out by
+ * hibernation_snapshot() might run into problems (i.e. it
+ * might fail or even deadlock).
+ */
+ thaw_kernel_threads();
break;
- case SNAPSHOT_SET_IMAGE_SIZE:
- snapshot_deprecated_ioctl(cmd);
case SNAPSHOT_PREF_IMAGE_SIZE:
image_size = arg;
break;
@@ -321,16 +301,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
error = put_user(size, (loff_t __user *)arg);
break;
- case SNAPSHOT_AVAIL_SWAP:
- snapshot_deprecated_ioctl(cmd);
case SNAPSHOT_AVAIL_SWAP_SIZE:
size = count_swap_pages(data->swap, 1);
size <<= PAGE_SHIFT;
error = put_user(size, (loff_t __user *)arg);
break;
- case SNAPSHOT_GET_SWAP_PAGE:
- snapshot_deprecated_ioctl(cmd);
case SNAPSHOT_ALLOC_SWAP_PAGE:
if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
error = -ENODEV;
@@ -353,27 +329,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
free_all_swap_pages(data->swap);
break;
- case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
- snapshot_deprecated_ioctl(cmd);
- if (!swsusp_swap_in_use()) {
- /*
- * User space encodes device types as two-byte values,
- * so we need to recode them
- */
- if (old_decode_dev(arg)) {
- data->swap = swap_type_of(old_decode_dev(arg),
- 0, NULL);
- if (data->swap < 0)
- error = -ENODEV;
- } else {
- data->swap = -1;
- error = -EINVAL;
- }
- } else {
- error = -EPERM;
- }
- break;
-
case SNAPSHOT_S2RAM:
if (!data->frozen) {
error = -EPERM;
@@ -396,33 +351,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
error = hibernation_platform_enter();
break;
- case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
- snapshot_deprecated_ioctl(cmd);
- error = -EINVAL;
-
- switch (arg) {
-
- case PMOPS_PREPARE:
- data->platform_support = 1;
- error = 0;
- break;
-
- case PMOPS_ENTER:
- if (data->platform_support)
- error = hibernation_platform_enter();
- break;
-
- case PMOPS_FINISH:
- if (data->platform_support)
- error = 0;
- break;
-
- default:
- printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg);
-
- }
- break;
-
case SNAPSHOT_SET_SWAP_AREA:
if (swsusp_swap_in_use()) {
error = -EPERM;
@@ -464,6 +392,66 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
return error;
}
+#ifdef CONFIG_COMPAT
+
+struct compat_resume_swap_area {
+ compat_loff_t offset;
+ u32 dev;
+} __packed;
+
+static long
+snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t));
+
+ switch (cmd) {
+ case SNAPSHOT_GET_IMAGE_SIZE:
+ case SNAPSHOT_AVAIL_SWAP_SIZE:
+ case SNAPSHOT_ALLOC_SWAP_PAGE: {
+ compat_loff_t __user *uoffset = compat_ptr(arg);
+ loff_t offset;
+ mm_segment_t old_fs;
+ int err;
+
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = snapshot_ioctl(file, cmd, (unsigned long) &offset);
+ set_fs(old_fs);
+ if (!err && put_user(offset, uoffset))
+ err = -EFAULT;
+ return err;
+ }
+
+ case SNAPSHOT_CREATE_IMAGE:
+ return snapshot_ioctl(file, cmd,
+ (unsigned long) compat_ptr(arg));
+
+ case SNAPSHOT_SET_SWAP_AREA: {
+ struct compat_resume_swap_area __user *u_swap_area =
+ compat_ptr(arg);
+ struct resume_swap_area swap_area;
+ mm_segment_t old_fs;
+ int err;
+
+ err = get_user(swap_area.offset, &u_swap_area->offset);
+ err |= get_user(swap_area.dev, &u_swap_area->dev);
+ if (err)
+ return -EFAULT;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = snapshot_ioctl(file, SNAPSHOT_SET_SWAP_AREA,
+ (unsigned long) &swap_area);
+ set_fs(old_fs);
+ return err;
+ }
+
+ default:
+ return snapshot_ioctl(file, cmd, arg);
+ }
+}
+
+#endif /* CONFIG_COMPAT */
+
static const struct file_operations snapshot_fops = {
.open = snapshot_open,
.release = snapshot_release,
@@ -471,6 +459,9 @@ static const struct file_operations snapshot_fops = {
.write = snapshot_write,
.llseek = no_llseek,
.unlocked_ioctl = snapshot_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = snapshot_compat_ioctl,
+#endif
};
static struct miscdevice snapshot_device = {
diff --git a/kernel/printk.c b/kernel/printk.c
index 1455a0d4eed..13c0a1143f4 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
unsigned long mem;
mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
- if (mem == MEMBLOCK_ERROR)
+ if (!mem)
return;
new_log_buf = __va(mem);
} else {
@@ -521,7 +521,7 @@ static void __call_console_drivers(unsigned start, unsigned end)
}
}
-static int __read_mostly ignore_loglevel;
+static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
@@ -532,7 +532,7 @@ static int __init ignore_loglevel_setup(char *str)
}
early_param("ignore_loglevel", ignore_loglevel_setup);
-module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
+module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to"
"print all kernel messages to the console.");
@@ -688,6 +688,7 @@ static void zap_locks(void)
oops_timestamp = jiffies;
+ debug_locks_off();
/* If a crash is occurring, make sure we can't deadlock */
raw_spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
@@ -695,9 +696,9 @@ static void zap_locks(void)
}
#if defined(CONFIG_PRINTK_TIME)
-static int printk_time = 1;
+static bool printk_time = 1;
#else
-static int printk_time = 0;
+static bool printk_time = 0;
#endif
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
boot_delay_msec();
printk_delay();
- preempt_disable();
/* This stops the holder of console_sem just where we want him */
- raw_local_irq_save(flags);
+ local_irq_save(flags);
this_cpu = smp_processor_id();
/*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* recursion and return - but flag the recursion so that
* it can be printed at the next appropriate moment:
*/
- if (!oops_in_progress) {
+ if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
goto out_restore_irqs;
}
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
lockdep_on();
out_restore_irqs:
- raw_local_irq_restore(flags);
+ local_irq_restore(flags);
- preempt_enable();
return printed_len;
}
EXPORT_SYMBOL(printk);
@@ -1099,7 +1098,7 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
return -1;
}
-int console_suspend_enabled = 1;
+bool console_suspend_enabled = 1;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
@@ -1293,10 +1292,11 @@ again:
raw_spin_lock(&logbuf_lock);
if (con_start != log_end)
retry = 1;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+
if (retry && console_trylock())
goto again;
- raw_spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
wake_up_klogd();
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 24d04477b25..00ab2ca5ed1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -96,9 +96,20 @@ void __ptrace_unlink(struct task_struct *child)
*/
if (!(child->flags & PF_EXITING) &&
(child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count))
+ child->signal->group_stop_count)) {
child->jobctl |= JOBCTL_STOP_PENDING;
+ /*
+ * This is only possible if this thread was cloned by the
+ * traced task running in the stopped group, set the signal
+ * for the future reports.
+ * FIXME: we should change ptrace_init_task() to handle this
+ * case.
+ */
+ if (!(child->jobctl & JOBCTL_STOP_SIGMASK))
+ child->jobctl |= SIGSTOP;
+ }
+
/*
* If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
* @child in the butt. Note that @resume should be used iff @child
@@ -161,6 +172,14 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
return ret;
}
+static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode)
+{
+ if (mode & PTRACE_MODE_NOAUDIT)
+ return has_ns_capability_noaudit(current, ns, CAP_SYS_PTRACE);
+ else
+ return has_ns_capability(current, ns, CAP_SYS_PTRACE);
+}
+
int __ptrace_may_access(struct task_struct *task, unsigned int mode)
{
const struct cred *cred = current_cred(), *tcred;
@@ -187,7 +206,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
cred->gid == tcred->sgid &&
cred->gid == tcred->gid))
goto ok;
- if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ if (ptrace_has_cap(tcred->user->user_ns, mode))
goto ok;
rcu_read_unlock();
return -EPERM;
@@ -196,7 +215,7 @@ ok:
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
+ if (!dumpable && !ptrace_has_cap(task_user_ns(task), mode))
return -EPERM;
return security_ptrace_access_check(task, mode);
@@ -266,7 +285,7 @@ static int ptrace_attach(struct task_struct *task, long request,
task->ptrace = PT_PTRACED;
if (seize)
task->ptrace |= PT_SEIZED;
- if (task_ns_capable(task, CAP_SYS_PTRACE))
+ if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550..aa88baab5f7 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
#endif /* #else #ifdef CONFIG_RCU_TRACE */
/*
+ * Process-level increment to ->dynticks_nesting field. This allows for
+ * architectures that use half-interrupts and half-exceptions from
+ * process context.
+ */
+#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
+
+/*
* debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
* by call_rcu() and rcu callback execution, and are therefore not part of the
* RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565ae..2bc4e135ff2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
{
if (!debug_lockdep_rcu_enabled())
return 1;
+ if (rcu_is_cpu_idle())
+ return 0;
return in_softirq() || irqs_disabled();
}
EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
};
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
+{
+ trace_rcu_torture_read(rcutorturename, rhp);
+}
+EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
+#else
+#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
+#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e..977296dca0a 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
#include "rcutiny_plugin.h"
-#ifdef CONFIG_NO_HZ
+static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
-static long rcu_dynticks_nesting = 1;
+/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
+static void rcu_idle_enter_common(long long oldval)
+{
+ if (rcu_dynticks_nesting) {
+ RCU_TRACE(trace_rcu_dyntick("--=",
+ oldval, rcu_dynticks_nesting));
+ return;
+ }
+ RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
+ if (!is_idle_task(current)) {
+ struct task_struct *idle = idle_task(smp_processor_id());
+
+ RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
+ oldval, rcu_dynticks_nesting));
+ ftrace_dump(DUMP_ALL);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+ rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+}
/*
- * Enter dynticks-idle mode, which is an extended quiescent state
- * if we have fully entered that mode (i.e., if the new value of
- * dynticks_nesting is zero).
+ * Enter idle, which is an extended quiescent state if we have fully
+ * entered that mode (i.e., if the new value of dynticks_nesting is zero).
*/
-void rcu_enter_nohz(void)
+void rcu_idle_enter(void)
{
- if (--rcu_dynticks_nesting == 0)
- rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
+ unsigned long flags;
+ long long oldval;
+
+ local_irq_save(flags);
+ oldval = rcu_dynticks_nesting;
+ rcu_dynticks_nesting = 0;
+ rcu_idle_enter_common(oldval);
+ local_irq_restore(flags);
}
/*
- * Exit dynticks-idle mode, so that we are no longer in an extended
- * quiescent state.
+ * Exit an interrupt handler towards idle.
*/
-void rcu_exit_nohz(void)
+void rcu_irq_exit(void)
+{
+ unsigned long flags;
+ long long oldval;
+
+ local_irq_save(flags);
+ oldval = rcu_dynticks_nesting;
+ rcu_dynticks_nesting--;
+ WARN_ON_ONCE(rcu_dynticks_nesting < 0);
+ rcu_idle_enter_common(oldval);
+ local_irq_restore(flags);
+}
+
+/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
+static void rcu_idle_exit_common(long long oldval)
{
+ if (oldval) {
+ RCU_TRACE(trace_rcu_dyntick("++=",
+ oldval, rcu_dynticks_nesting));
+ return;
+ }
+ RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
+ if (!is_idle_task(current)) {
+ struct task_struct *idle = idle_task(smp_processor_id());
+
+ RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
+ oldval, rcu_dynticks_nesting));
+ ftrace_dump(DUMP_ALL);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+}
+
+/*
+ * Exit idle, so that we are no longer in an extended quiescent state.
+ */
+void rcu_idle_exit(void)
+{
+ unsigned long flags;
+ long long oldval;
+
+ local_irq_save(flags);
+ oldval = rcu_dynticks_nesting;
+ WARN_ON_ONCE(oldval != 0);
+ rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
+ rcu_idle_exit_common(oldval);
+ local_irq_restore(flags);
+}
+
+/*
+ * Enter an interrupt handler, moving away from idle.
+ */
+void rcu_irq_enter(void)
+{
+ unsigned long flags;
+ long long oldval;
+
+ local_irq_save(flags);
+ oldval = rcu_dynticks_nesting;
rcu_dynticks_nesting++;
+ WARN_ON_ONCE(rcu_dynticks_nesting == 0);
+ rcu_idle_exit_common(oldval);
+ local_irq_restore(flags);
+}
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Test whether RCU thinks that the current CPU is idle.
+ */
+int rcu_is_cpu_idle(void)
+{
+ return !rcu_dynticks_nesting;
}
+EXPORT_SYMBOL(rcu_is_cpu_idle);
+
+#endif /* #ifdef CONFIG_PROVE_RCU */
-#endif /* #ifdef CONFIG_NO_HZ */
+/*
+ * Test whether the current CPU was interrupted from idle. Nested
+ * interrupts don't count, we must be running at the first interrupt
+ * level.
+ */
+int rcu_is_cpu_rrupt_from_idle(void)
+{
+ return rcu_dynticks_nesting <= 0;
+}
/*
* Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
/*
* Check to see if the scheduling-clock interrupt came from an extended
- * quiescent state, and, if so, tell RCU about it.
+ * quiescent state, and, if so, tell RCU about it. This function must
+ * be called from hardirq context. It is normally called from the
+ * scheduling-clock interrupt.
*/
void rcu_check_callbacks(int cpu, int user)
{
- if (user ||
- (idle_cpu(cpu) &&
- !in_softirq() &&
- hardirq_count() <= (1 << HARDIRQ_SHIFT)))
+ if (user || rcu_is_cpu_rrupt_from_idle())
rcu_sched_qs(cpu);
else if (!in_softirq())
rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
/* If no RCU callbacks ready to invoke, just return. */
if (&rcp->rcucblist == rcp->donetail) {
RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
- RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
+ RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
+ ACCESS_ONCE(rcp->rcucblist),
+ need_resched(),
+ is_idle_task(current),
+ rcu_is_callbacks_kthread()));
return;
}
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
RCU_TRACE(cb_count++);
}
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
- RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
+ RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
+ is_idle_task(current),
+ rcu_is_callbacks_kthread()));
}
static void rcu_process_callbacks(struct softirq_action *unused)
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc2..9cb1ae4aabd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
rt_mutex_lock(&mtx);
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
- return rcu_preempt_ctrlblk.boost_tasks != NULL ||
- rcu_preempt_ctrlblk.exp_tasks != NULL;
+ return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
+ ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
}
/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
wake_up(&rcu_kthread_wq);
}
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+ return rcu_kthread_task == current;
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
/*
* This kthread invokes RCU callbacks whose grace periods have
* elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
raise_softirq(RCU_SOFTIRQ);
}
+#ifdef CONFIG_RCU_TRACE
+
+/*
+ * There is no callback kthread, so this thread is never it.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+ return false;
+}
+
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
void rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685..a58ac285fc6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -56,14 +56,16 @@ static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */
static int nfakewriters = 4; /* # fake writer threads */
static int stat_interval; /* Interval between stats, in seconds. */
/* Defaults to "only at end of test". */
-static int verbose; /* Print more debug info. */
-static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
+static bool verbose; /* Print more debug info. */
+static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
static int stutter = 5; /* Start/stop testing interval (in sec) */
static int irqreader = 1; /* RCU readers from irq (timers). */
-static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
-static int fqs_holdoff = 0; /* Hold time within burst (us). */
+static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
+static int fqs_holdoff; /* Hold time within burst (us). */
static int fqs_stutter = 3; /* Wait time between bursts (s). */
+static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
+static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
module_param(fqs_stutter, int, 0444);
MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
+module_param(onoff_interval, int, 0444);
+MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
+module_param(shutdown_secs, int, 0444);
+MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
module_param(test_boost, int, 0444);
MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
+static struct task_struct *shutdown_task;
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct *onoff_task;
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
#define RCU_TORTURE_PIPE_LEN 10
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
+static long n_offline_attempts;
+static long n_offline_successes;
+static long n_online_attempts;
+static long n_online_successes;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
@@ -160,6 +174,8 @@ static int stutter_pause_test;
#define RCUTORTURE_RUNNABLE_INIT 0
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+module_param(rcutorture_runnable, int, 0444);
+MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
#define rcu_can_boost() 0
#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
+static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
*/
static DEFINE_MUTEX(fullstop_mutex);
+/* Forward reference. */
+static void rcu_torture_cleanup(void);
+
/*
* Detect and respond to a system shutdown.
*/
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
.name = "srcu"
};
+static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
+{
+ return srcu_read_lock_raw(&srcu_ctl);
+}
+
+static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
+{
+ srcu_read_unlock_raw(&srcu_ctl, idx);
+}
+
+static struct rcu_torture_ops srcu_raw_ops = {
+ .init = srcu_torture_init,
+ .cleanup = srcu_torture_cleanup,
+ .readlock = srcu_torture_read_lock_raw,
+ .read_delay = srcu_read_delay,
+ .readunlock = srcu_torture_read_unlock_raw,
+ .completed = srcu_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = srcu_torture_synchronize,
+ .cb_barrier = NULL,
+ .stats = srcu_torture_stats,
+ .name = "srcu_raw"
+};
+
static void srcu_torture_synchronize_expedited(void)
{
synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
+void rcutorture_trace_dump(void)
+{
+ static atomic_t beenhere = ATOMIC_INIT(0);
+
+ if (atomic_read(&beenhere))
+ return;
+ if (atomic_xchg(&beenhere, 1) != 0)
+ return;
+ do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
+ ftrace_dump(DUMP_ALL);
+}
+
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
srcu_read_lock_held(&srcu_ctl));
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p == NULL) {
/* Leave because rcu_torture_writer is not yet underway */
cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
+ if (pipe_count > 1)
+ rcutorture_trace_dump();
__this_cpu_inc(rcu_torture_count[pipe_count]);
completed = cur_ops->completed() - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
rcu_read_lock_bh_held() ||
rcu_read_lock_sched_held() ||
srcu_read_lock_held(&srcu_ctl));
+ do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
/* Should not happen, but... */
pipe_count = RCU_TORTURE_PIPE_LEN;
}
+ if (pipe_count > 1)
+ rcutorture_trace_dump();
__this_cpu_inc(rcu_torture_count[pipe_count]);
completed = cur_ops->completed() - completed;
if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
cnt += sprintf(&page[cnt],
"rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
"rtmbe: %d rtbke: %ld rtbre: %ld "
- "rtbf: %ld rtb: %ld nt: %ld",
+ "rtbf: %ld rtb: %ld nt: %ld "
+ "onoff: %ld/%ld:%ld/%ld",
rcu_torture_current,
rcu_torture_current_version,
list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
n_rcu_torture_boost_rterror,
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
- n_rcu_torture_timers);
+ n_rcu_torture_timers,
+ n_online_successes,
+ n_online_attempts,
+ n_offline_successes,
+ n_offline_attempts);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
"shuffle_interval=%d stutter=%d irqreader=%d "
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
- "test_boost_duration=%d\n",
+ "test_boost_duration=%d shutdown_secs=%d "
+ "onoff_interval=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
- test_boost_interval, test_boost_duration);
+ test_boost_interval, test_boost_duration, shutdown_secs,
+ onoff_interval);
}
static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
return 0;
}
+/*
+ * Cause the rcutorture test to shutdown the system after the test has
+ * run for the time specified by the shutdown_secs module parameter.
+ */
+static int
+rcu_torture_shutdown(void *arg)
+{
+ long delta;
+ unsigned long jiffies_snap;
+
+ VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
+ jiffies_snap = ACCESS_ONCE(jiffies);
+ while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ !kthread_should_stop()) {
+ delta = shutdown_time - jiffies_snap;
+ if (verbose)
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "rcu_torture_shutdown task: %lu "
+ "jiffies remaining\n",
+ torture_type, delta);
+ schedule_timeout_interruptible(delta);
+ jiffies_snap = ACCESS_ONCE(jiffies);
+ }
+ if (kthread_should_stop()) {
+ VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
+ return 0;
+ }
+
+ /* OK, shut down the system. */
+
+ VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
+ shutdown_task = NULL; /* Avoid self-kill deadlock. */
+ rcu_torture_cleanup(); /* Get the success/failure message. */
+ kernel_power_off(); /* Shut down the system. */
+ return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int __cpuinit
+rcu_torture_onoff(void *arg)
+{
+ int cpu;
+ int maxcpu = -1;
+ DEFINE_RCU_RANDOM(rand);
+
+ VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ while (!kthread_should_stop()) {
+ cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
+ if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "rcu_torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ n_offline_attempts++;
+ if (cpu_down(cpu) == 0) {
+ if (verbose)
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "rcu_torture_onoff task: "
+ "offlined %d\n",
+ torture_type, cpu);
+ n_offline_successes++;
+ }
+ } else if (cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "rcu_torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ n_online_attempts++;
+ if (cpu_up(cpu) == 0) {
+ if (verbose)
+ printk(KERN_ALERT "%s" TORTURE_FLAG
+ "rcu_torture_onoff task: "
+ "onlined %d\n",
+ torture_type, cpu);
+ n_online_successes++;
+ }
+ }
+ schedule_timeout_interruptible(onoff_interval * HZ);
+ }
+ VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
+ return 0;
+}
+
+static int __cpuinit
+rcu_torture_onoff_init(void)
+{
+ if (onoff_interval <= 0)
+ return 0;
+ onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
+ if (IS_ERR(onoff_task)) {
+ onoff_task = NULL;
+ return PTR_ERR(onoff_task);
+ }
+ return 0;
+}
+
+static void rcu_torture_onoff_cleanup(void)
+{
+ if (onoff_task == NULL)
+ return;
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
+ kthread_stop(onoff_task);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void
+rcu_torture_onoff_init(void)
+{
+}
+
+static void rcu_torture_onoff_cleanup(void)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
static int rcutorture_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
for_each_possible_cpu(i)
rcutorture_booster_cleanup(i);
}
+ if (shutdown_task != NULL) {
+ VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
+ kthread_stop(shutdown_task);
+ }
+ rcu_torture_onoff_cleanup();
/* Wait for all RCU callbacks to fire. */
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
&rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
- &srcu_ops, &srcu_expedited_ops,
+ &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
&sched_ops, &sched_sync_ops, &sched_expedited_ops, };
mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
}
}
}
+ if (shutdown_secs > 0) {
+ shutdown_time = jiffies + shutdown_secs * HZ;
+ shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
+ "rcu_torture_shutdown");
+ if (IS_ERR(shutdown_task)) {
+ firsterr = PTR_ERR(shutdown_task);
+ VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
+ shutdown_task = NULL;
+ goto unwind;
+ }
+ }
+ rcu_torture_onoff_init();
register_reboot_notifier(&rcutorture_shutdown_nb);
rcutorture_record_test_transition();
mutex_unlock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740..6c4a6722abf 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
NUM_RCU_LVL_3, \
NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
}, \
- .signaled = RCU_GP_IDLE, \
+ .fqs_state = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
-#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
- .dynticks_nesting = 1,
+ .dynticks_nesting = DYNTICK_TASK_NESTING,
.dynticks = ATOMIC_INIT(1),
};
-#endif /* #ifdef CONFIG_NO_HZ */
static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
return 1;
}
- /* If preemptible RCU, no point in sending reschedule IPI. */
- if (rdp->preemptible)
- return 0;
-
- /* The CPU is online, so send it a reschedule IPI. */
+ /*
+ * The CPU is online, so send it a reschedule IPI. This forces
+ * it through the scheduler, and (inefficiently) also handles cases
+ * where idle loops fail to inform RCU about the CPU being idle.
+ */
if (rdp->cpu != smp_processor_id())
smp_send_reschedule(rdp->cpu);
else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
#endif /* #ifdef CONFIG_SMP */
-#ifdef CONFIG_NO_HZ
+/*
+ * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
+ *
+ * If the new value of the ->dynticks_nesting counter now is zero,
+ * we really have entered idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
+{
+ trace_rcu_dyntick("Start", oldval, 0);
+ if (!is_idle_task(current)) {
+ struct task_struct *idle = idle_task(smp_processor_id());
+
+ trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
+ ftrace_dump(DUMP_ALL);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+ rcu_prepare_for_idle(smp_processor_id());
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+}
/**
- * rcu_enter_nohz - inform RCU that current CPU is entering nohz
+ * rcu_idle_enter - inform RCU that current CPU is entering idle
*
- * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * Enter idle mode, in other words, -leave- the mode in which RCU
* read-side critical sections can occur. (Though RCU read-side
- * critical sections can occur in irq handlers in nohz mode, a possibility
- * handled by rcu_irq_enter() and rcu_irq_exit()).
+ * critical sections can occur in irq handlers in idle, a possibility
+ * handled by irq_enter() and irq_exit().)
+ *
+ * We crowbar the ->dynticks_nesting field to zero to allow for
+ * the possibility of usermode upcalls having messed up our count
+ * of interrupt nesting level during the prior busy period.
*/
-void rcu_enter_nohz(void)
+void rcu_idle_enter(void)
{
unsigned long flags;
+ long long oldval;
struct rcu_dynticks *rdtp;
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- if (--rdtp->dynticks_nesting) {
- local_irq_restore(flags);
- return;
- }
- trace_rcu_dyntick("Start");
- /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
- smp_mb__before_atomic_inc(); /* See above. */
- atomic_inc(&rdtp->dynticks);
- smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
- WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting = 0;
+ rcu_idle_enter_common(rdtp, oldval);
local_irq_restore(flags);
}
-/*
- * rcu_exit_nohz - inform RCU that current CPU is leaving nohz
+/**
+ * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
+ *
+ * Exit from an interrupt handler, which might possibly result in entering
+ * idle mode, in other words, leaving the mode in which read-side critical
+ * sections can occur.
*
- * Exit nohz mode, in other words, -enter- the mode in which RCU
- * read-side critical sections normally occur.
+ * This code assumes that the idle loop never does anything that might
+ * result in unbalanced calls to irq_enter() and irq_exit(). If your
+ * architecture violates this assumption, RCU will give you what you
+ * deserve, good and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
*/
-void rcu_exit_nohz(void)
+void rcu_irq_exit(void)
{
unsigned long flags;
+ long long oldval;
struct rcu_dynticks *rdtp;
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks_nesting++) {
- local_irq_restore(flags);
- return;
- }
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting--;
+ WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
+ if (rdtp->dynticks_nesting)
+ trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
+ else
+ rcu_idle_enter_common(rdtp, oldval);
+ local_irq_restore(flags);
+}
+
+/*
+ * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
+ *
+ * If the new value of the ->dynticks_nesting counter was previously zero,
+ * we really have exited idle, and must do the appropriate accounting.
+ * The caller must have disabled interrupts.
+ */
+static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
+{
smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic_inc(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- trace_rcu_dyntick("End");
+ rcu_cleanup_after_idle(smp_processor_id());
+ trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
+ if (!is_idle_task(current)) {
+ struct task_struct *idle = idle_task(smp_processor_id());
+
+ trace_rcu_dyntick("Error on exit: not idle task",
+ oldval, rdtp->dynticks_nesting);
+ ftrace_dump(DUMP_ALL);
+ WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
+ current->pid, current->comm,
+ idle->pid, idle->comm); /* must be idle task! */
+ }
+}
+
+/**
+ * rcu_idle_exit - inform RCU that current CPU is leaving idle
+ *
+ * Exit idle mode, in other words, -enter- the mode in which RCU
+ * read-side critical sections can occur.
+ *
+ * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
+ * allow for the possibility of usermode upcalls messing up our count
+ * of interrupt nesting level during the busy period that is just
+ * now starting.
+ */
+void rcu_idle_exit(void)
+{
+ unsigned long flags;
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ local_irq_save(flags);
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ WARN_ON_ONCE(oldval != 0);
+ rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
+ rcu_idle_exit_common(rdtp, oldval);
+ local_irq_restore(flags);
+}
+
+/**
+ * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
+ *
+ * Enter an interrupt handler, which might possibly result in exiting
+ * idle mode, in other words, entering the mode in which read-side critical
+ * sections can occur.
+ *
+ * Note that the Linux kernel is fully capable of entering an interrupt
+ * handler that it never exits, for example when doing upcalls to
+ * user mode! This code assumes that the idle loop never does upcalls to
+ * user mode. If your architecture does do upcalls from the idle loop (or
+ * does anything else that results in unbalanced calls to the irq_enter()
+ * and irq_exit() functions), RCU will give you what you deserve, good
+ * and hard. But very infrequently and irreproducibly.
+ *
+ * Use things like work queues to work around this limitation.
+ *
+ * You have been warned.
+ */
+void rcu_irq_enter(void)
+{
+ unsigned long flags;
+ struct rcu_dynticks *rdtp;
+ long long oldval;
+
+ local_irq_save(flags);
+ rdtp = &__get_cpu_var(rcu_dynticks);
+ oldval = rdtp->dynticks_nesting;
+ rdtp->dynticks_nesting++;
+ WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
+ if (oldval)
+ trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
+ else
+ rcu_idle_exit_common(rdtp, oldval);
local_irq_restore(flags);
}
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
+#ifdef CONFIG_PROVE_RCU
+
/**
- * rcu_irq_enter - inform RCU of entry to hard irq context
+ * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
*
- * If the CPU was idle with dynamic ticks active, this updates the
- * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ * If the current CPU is in its idle loop and is neither in an interrupt
+ * or NMI handler, return true.
*/
-void rcu_irq_enter(void)
+int rcu_is_cpu_idle(void)
{
- rcu_exit_nohz();
+ int ret;
+
+ preempt_disable();
+ ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
+ preempt_enable();
+ return ret;
}
+EXPORT_SYMBOL(rcu_is_cpu_idle);
+
+#endif /* #ifdef CONFIG_PROVE_RCU */
/**
- * rcu_irq_exit - inform RCU of exit from hard irq context
+ * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
*
- * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
- * to put let the RCU handling be aware that the CPU is going back to idle
- * with no ticks.
+ * If the current CPU is idle or running at a first-level (not nested)
+ * interrupt from idle, return true. The caller must have at least
+ * disabled preemption.
*/
-void rcu_irq_exit(void)
+int rcu_is_cpu_rrupt_from_idle(void)
{
- rcu_enter_nohz();
+ return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
}
#ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
- return 0;
+ return (rdp->dynticks_snap & 0x1) == 0;
}
/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#endif /* #ifdef CONFIG_SMP */
-#else /* #ifdef CONFIG_NO_HZ */
-
-#ifdef CONFIG_SMP
-
-static int dyntick_save_progress_counter(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
-{
- return rcu_implicit_offline_qs(rdp);
-}
-
-#endif /* #ifdef CONFIG_SMP */
-
-#endif /* #else #ifdef CONFIG_NO_HZ */
-
-int rcu_cpu_stall_suppress __read_mostly;
-
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
- WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
- rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
+ WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
+ rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
record_gp_stall_check_time(rsp);
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rnp->qsmask = rnp->qsmaskinit;
rnp->gpnum = rsp->gpnum;
rnp->completed = rsp->completed;
- rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
+ rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
rcu_start_gp_per_cpu(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rnp = rcu_get_root(rsp);
raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+ rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
- rsp->signaled = RCU_GP_IDLE;
+ rsp->fqs_state = RCU_GP_IDLE;
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
}
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
else
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
- rcu_report_exp_rnp(rsp, rnp);
+ rcu_report_exp_rnp(rsp, rnp, true);
rcu_node_kthread_setaffinity(rnp, -1);
}
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* If no callbacks are ready, just return.*/
if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
trace_rcu_batch_start(rsp->name, 0, 0);
- trace_rcu_batch_end(rsp->name, 0);
+ trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
+ need_resched(), is_idle_task(current),
+ rcu_is_callbacks_kthread());
return;
}
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
debug_rcu_head_unqueue(list);
__rcu_reclaim(rsp->name, list);
list = next;
- if (++count >= bl)
+ /* Stop only if limit reached and CPU has something to do. */
+ if (++count >= bl &&
+ (need_resched() ||
+ (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
}
local_irq_save(flags);
- trace_rcu_batch_end(rsp->name, count);
+ trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
+ is_idle_task(current),
+ rcu_is_callbacks_kthread());
/* Update count, and requeue any remaining callbacks. */
rdp->qlen -= count;
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
* Also schedule RCU core processing.
*
- * This function must be called with hardirqs disabled. It is normally
+ * This function must be called from hardirq context. It is normally
* invoked from the scheduling-clock interrupt. If rcu_pending returns
* false, there is no point in invoking rcu_check_callbacks().
*/
void rcu_check_callbacks(int cpu, int user)
{
trace_rcu_utilization("Start scheduler-tick");
- if (user ||
- (idle_cpu(cpu) && rcu_scheduler_active &&
- !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+ if (user || rcu_is_cpu_rrupt_from_idle()) {
/*
* Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
goto unlock_fqs_ret; /* no GP in progress, time updated. */
}
rsp->fqs_active = 1;
- switch (rsp->signaled) {
+ switch (rsp->fqs_state) {
case RCU_GP_IDLE:
case RCU_GP_INIT:
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
force_qs_rnp(rsp, dyntick_save_progress_counter);
raw_spin_lock(&rnp->lock); /* irqs already disabled */
if (rcu_gp_in_progress(rsp))
- rsp->signaled = RCU_FORCE_QS;
+ rsp->fqs_state = RCU_FORCE_QS;
break;
case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
* by the current CPU, even if none need be done immediately, returning
* 1 if so.
*/
-static int rcu_needs_cpu_quick_check(int cpu)
+static int rcu_cpu_has_callbacks(int cpu)
{
/* RCU callbacks either ready or pending? */
return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
rdp->qlen = 0;
-#ifdef CONFIG_NO_HZ
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
-#endif /* #ifdef CONFIG_NO_HZ */
+ WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
+ WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
rdp->cpu = cpu;
rdp->rsp = rsp;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
+ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
+ atomic_set(&rdp->dynticks->dynticks,
+ (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
+ rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
/*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
rcu_send_cbs_to_online(&rcu_bh_state);
rcu_send_cbs_to_online(&rcu_sched_state);
rcu_preempt_send_cbs_to_online();
+ rcu_cleanup_after_idle(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51f..fddff92d667 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- int dynticks_nesting; /* Track irq/process nesting level. */
- int dynticks_nmi_nesting; /* Track NMI nesting level. */
- atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
+ long long dynticks_nesting; /* Track irq/process nesting level. */
+ /* Process level is worth LLONG_MAX/2. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for idle, else odd. */
};
/* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
-#ifdef CONFIG_NO_HZ
/* 3) dynticks interface. */
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
-#endif /* #ifdef CONFIG_NO_HZ */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
-#ifdef CONFIG_NO_HZ
unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
-#endif /* #ifdef CONFIG_NO_HZ */
unsigned long offline_fqs; /* Kicked due to being offline. */
unsigned long resched_ipi; /* Sent a resched IPI. */
@@ -302,16 +299,12 @@ struct rcu_data {
struct rcu_state *rsp;
};
-/* Values for signaled field in struct rcu_state. */
+/* Values for fqs_state field in struct rcu_state. */
#define RCU_GP_IDLE 0 /* No grace period in progress. */
#define RCU_GP_INIT 1 /* Grace period being initialized. */
#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
-#ifdef CONFIG_NO_HZ
#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
-#else /* #ifdef CONFIG_NO_HZ */
-#define RCU_SIGNAL_INIT RCU_FORCE_QS
-#endif /* #else #ifdef CONFIG_NO_HZ */
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -361,7 +354,7 @@ struct rcu_state {
/* The following fields are guarded by the root rcu_node's lock. */
- u8 signaled ____cacheline_internodealigned_in_smp;
+ u8 fqs_state ____cacheline_internodealigned_in_smp;
/* Force QS state. */
u8 fqs_active; /* force_quiescent_state() */
/* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
static void rcu_preempt_process_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake);
#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
static int rcu_preempt_pending(int cpu);
static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
+static bool rcu_is_callbacks_kthread(void);
#ifdef CONFIG_RCU_BOOST
static void rcu_preempt_do_callbacks(void);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
#endif /* #ifdef CONFIG_RCU_BOOST */
static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
static void __cpuinit rcu_prepare_kthreads(int cpu);
+static void rcu_prepare_for_idle_init(int cpu);
+static void rcu_cleanup_after_idle(int cpu);
+static void rcu_prepare_for_idle(int cpu);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a418..8bb35d73e1f 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
{
int empty;
int empty_exp;
+ int empty_exp_now;
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
/*
* If this was the last task on the current list, and if
* we aren't waiting on any CPUs, report the quiescent state.
- * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
+ * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
+ * so we must take a snapshot of the expedited state.
*/
+ empty_exp_now = !rcu_preempted_readers_exp(rnp);
if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
trace_rcu_quiescent_state_report("preempt_rcu",
rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
- if (!empty_exp && !rcu_preempted_readers_exp(rnp))
- rcu_report_exp_rnp(&rcu_preempt_state, rnp);
+ if (!empty_exp && empty_exp_now)
+ rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
} else {
local_irq_restore(flags);
}
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
* recursively up the tree. (Calm down, calm down, we do the recursion
* iteratively!)
*
+ * Most callers will set the "wake" flag, but the task initiating the
+ * expedited grace period need not wake itself.
+ *
* Caller must hold sync_rcu_preempt_exp_mutex.
*/
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake)
{
unsigned long flags;
unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- wake_up(&sync_rcu_preempt_exp_wq);
+ if (wake)
+ wake_up(&sync_rcu_preempt_exp_wq);
break;
}
mask = rnp->grpmask;
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
must_wait = 1;
}
if (!must_wait)
- rcu_report_exp_rnp(rsp, rnp);
+ rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
}
/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
* report on tasks preempted in RCU read-side critical sections during
* expedited RCU grace periods.
*/
-static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
+static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
+ bool wake)
{
- return;
}
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-static struct lock_class_key rcu_boost_class;
-
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
- /* Avoid lockdep false positives. This rt_mutex is its own thing. */
- lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
- "rcu_boost_mutex");
t->rcu_boost_mutex = &mtx;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
- return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
+ return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
+ ACCESS_ONCE(rnp->boost_tasks) != NULL;
}
/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
}
/*
+ * Is the current CPU running the RCU-callbacks kthread?
+ * Caller must have preemption disabled.
+ */
+static bool rcu_is_callbacks_kthread(void)
+{
+ return __get_cpu_var(rcu_cpu_kthread_task) == current;
+}
+
+/*
* Set the affinity of the boost kthread. The CPU-hotplug locks are
* held, so no one should be messing with the existence of the boost
* kthread.
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
WARN_ON_ONCE(1);
}
+static bool rcu_is_callbacks_kthread(void)
+{
+ return false;
+}
+
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
* grace period works for us.
*/
get_online_cpus();
- snap = atomic_read(&sync_sched_expedited_started) - 1;
+ snap = atomic_read(&sync_sched_expedited_started);
smp_mb(); /* ensure read is before try_stop_cpus(). */
}
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
* 1 if so. This function is part of the RCU implementation; it is -not-
* an exported member of the RCU API.
*
- * Because we have preemptible RCU, just check whether this CPU needs
- * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
- * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
+ * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
+ * any flavor of RCU.
*/
int rcu_needs_cpu(int cpu)
{
- return rcu_needs_cpu_quick_check(cpu);
+ return rcu_cpu_has_callbacks(cpu);
+}
+
+/*
+ * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
+ */
+static void rcu_prepare_for_idle_init(int cpu)
+{
+}
+
+/*
+ * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
+ * after it.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+}
+
+/*
+ * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y,
+ * is nothing.
+ */
+static void rcu_prepare_for_idle(int cpu)
+{
}
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#define RCU_NEEDS_CPU_FLUSHES 5
+/*
+ * This code is invoked when a CPU goes idle, at which point we want
+ * to have the CPU do everything required for RCU so that it can enter
+ * the energy-efficient dyntick-idle mode. This is handled by a
+ * state machine implemented by rcu_prepare_for_idle() below.
+ *
+ * The following three proprocessor symbols control this state machine:
+ *
+ * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
+ * to satisfy RCU. Beyond this point, it is better to incur a periodic
+ * scheduling-clock interrupt than to loop through the state machine
+ * at full power.
+ * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
+ * optional if RCU does not need anything immediately from this
+ * CPU, even if this CPU still has RCU callbacks queued. The first
+ * times through the state machine are mandatory: we need to give
+ * the state machine a chance to communicate a quiescent state
+ * to the RCU core.
+ * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
+ * to sleep in dyntick-idle mode with RCU callbacks pending. This
+ * is sized to be roughly one RCU grace period. Those energy-efficiency
+ * benchmarkers who might otherwise be tempted to set this to a large
+ * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
+ * system. And if you are -that- concerned about energy efficiency,
+ * just power the system down and be done with it!
+ *
+ * The values below work well in practice. If future workloads require
+ * adjustment, they can be converted into kernel config parameters, though
+ * making the state machine smarter might be a better option.
+ */
+#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
+#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
+#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
+
static DEFINE_PER_CPU(int, rcu_dyntick_drain);
static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
+static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
+static ktime_t rcu_idle_gp_wait;
/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so. This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
+ * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
+ * callbacks on this CPU, (2) this CPU has not yet attempted to enter
+ * dyntick-idle mode, or (3) this CPU is in the process of attempting to
+ * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
+ * to enter dyntick-idle mode, we refuse to try to enter it. After all,
+ * it is better to incur scheduling-clock interrupts than to spin
+ * continuously for the same time duration!
+ */
+int rcu_needs_cpu(int cpu)
+{
+ /* If no callbacks, RCU doesn't need the CPU. */
+ if (!rcu_cpu_has_callbacks(cpu))
+ return 0;
+ /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
+ return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
+}
+
+/*
+ * Timer handler used to force CPU to start pushing its remaining RCU
+ * callbacks in the case where it entered dyntick-idle mode with callbacks
+ * pending. The hander doesn't really need to do anything because the
+ * real work is done upon re-entry to idle, or by the next scheduling-clock
+ * interrupt should idle not be re-entered.
+ */
+static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
+{
+ trace_rcu_prep_idle("Timer");
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * Initialize the timer used to pull CPUs out of dyntick-idle mode.
+ */
+static void rcu_prepare_for_idle_init(int cpu)
+{
+ static int firsttime = 1;
+ struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
+
+ hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtp->function = rcu_idle_gp_timer_func;
+ if (firsttime) {
+ unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
+
+ rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
+ firsttime = 0;
+ }
+}
+
+/*
+ * Clean up for exit from idle. Because we are exiting from idle, there
+ * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
+ * do nothing if this timer is not active, so just cancel it unconditionally.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+ hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
+}
+
+/*
+ * Check to see if any RCU-related work can be done by the current CPU,
+ * and if so, schedule a softirq to get it done. This function is part
+ * of the RCU implementation; it is -not- an exported member of the RCU API.
*
- * Because we are not supporting preemptible RCU, attempt to accelerate
- * any current grace periods so that RCU no longer needs this CPU, but
- * only if all other CPUs are already in dynticks-idle mode. This will
- * allow the CPU cores to be powered down immediately, as opposed to after
- * waiting many milliseconds for grace periods to elapse.
+ * The idea is for the current CPU to clear out all work required by the
+ * RCU core for the current grace period, so that this CPU can be permitted
+ * to enter dyntick-idle mode. In some cases, it will need to be awakened
+ * at the end of the grace period by whatever CPU ends the grace period.
+ * This allows CPUs to go dyntick-idle more quickly, and to reduce the
+ * number of wakeups by a modest integer factor.
*
* Because it is not legal to invoke rcu_process_callbacks() with irqs
* disabled, we do one pass of force_quiescent_state(), then do a
* invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
* later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
+ *
+ * The caller must have disabled interrupts.
*/
-int rcu_needs_cpu(int cpu)
+static void rcu_prepare_for_idle(int cpu)
{
- int c = 0;
- int snap;
- int thatcpu;
-
- /* Check for being in the holdoff period. */
- if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies)
- return rcu_needs_cpu_quick_check(cpu);
-
- /* Don't bother unless we are the last non-dyntick-idle CPU. */
- for_each_online_cpu(thatcpu) {
- if (thatcpu == cpu)
- continue;
- snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
- thatcpu).dynticks);
- smp_mb(); /* Order sampling of snap with end of grace period. */
- if ((snap & 0x1) != 0) {
- per_cpu(rcu_dyntick_drain, cpu) = 0;
- per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
- return rcu_needs_cpu_quick_check(cpu);
- }
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ /*
+ * If there are no callbacks on this CPU, enter dyntick-idle mode.
+ * Also reset state to avoid prejudicing later attempts.
+ */
+ if (!rcu_cpu_has_callbacks(cpu)) {
+ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+ per_cpu(rcu_dyntick_drain, cpu) = 0;
+ local_irq_restore(flags);
+ trace_rcu_prep_idle("No callbacks");
+ return;
+ }
+
+ /*
+ * If in holdoff mode, just return. We will presumably have
+ * refrained from disabling the scheduling-clock tick.
+ */
+ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
+ local_irq_restore(flags);
+ trace_rcu_prep_idle("In holdoff");
+ return;
}
/* Check and update the rcu_dyntick_drain sequencing. */
if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
/* First time through, initialize the counter. */
- per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
+ per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
+ } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
+ !rcu_pending(cpu)) {
+ /* Can we go dyntick-idle despite still having callbacks? */
+ trace_rcu_prep_idle("Dyntick with callbacks");
+ per_cpu(rcu_dyntick_drain, cpu) = 0;
+ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
+ hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
+ rcu_idle_gp_wait, HRTIMER_MODE_REL);
+ return; /* Nothing more to do immediately. */
} else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
/* We have hit the limit, so time to give up. */
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
- return rcu_needs_cpu_quick_check(cpu);
+ local_irq_restore(flags);
+ trace_rcu_prep_idle("Begin holdoff");
+ invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
+ return;
}
- /* Do one step pushing remaining RCU callbacks through. */
+ /*
+ * Do one step of pushing the remaining RCU callbacks through
+ * the RCU core state machine.
+ */
+#ifdef CONFIG_TREE_PREEMPT_RCU
+ if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
+ local_irq_restore(flags);
+ rcu_preempt_qs(cpu);
+ force_quiescent_state(&rcu_preempt_state, 0);
+ local_irq_save(flags);
+ }
+#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
if (per_cpu(rcu_sched_data, cpu).nxtlist) {
+ local_irq_restore(flags);
rcu_sched_qs(cpu);
force_quiescent_state(&rcu_sched_state, 0);
- c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
+ local_irq_save(flags);
}
if (per_cpu(rcu_bh_data, cpu).nxtlist) {
+ local_irq_restore(flags);
rcu_bh_qs(cpu);
force_quiescent_state(&rcu_bh_state, 0);
- c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
+ local_irq_save(flags);
}
- /* If RCU callbacks are still pending, RCU still needs this CPU. */
- if (c)
+ /*
+ * If RCU callbacks are still pending, RCU still needs this CPU.
+ * So try forcing the callbacks through the grace period.
+ */
+ if (rcu_cpu_has_callbacks(cpu)) {
+ local_irq_restore(flags);
+ trace_rcu_prep_idle("More callbacks");
invoke_rcu_core();
- return c;
+ } else {
+ local_irq_restore(flags);
+ trace_rcu_prep_idle("Callbacks drained");
+ }
}
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c069..654cfe67f0d 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->completed, rdp->gpnum,
rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
rdp->qs_pending);
-#ifdef CONFIG_NO_HZ
- seq_printf(m, " dt=%d/%d/%d df=%lu",
+ seq_printf(m, " dt=%d/%llx/%d df=%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
-#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
seq_printf(m, " ql=%ld qs=%c%c%c%c",
rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->completed, rdp->gpnum,
rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
rdp->qs_pending);
-#ifdef CONFIG_NO_HZ
- seq_printf(m, ",%d,%d,%d,%lu",
+ seq_printf(m, ",%d,%llx,%d,%lu",
atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
-#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
static int show_rcudata_csv(struct seq_file *m, void *unused)
{
seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
-#ifdef CONFIG_NO_HZ
seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
-#endif /* #ifdef CONFIG_NO_HZ */
seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
#ifdef CONFIG_RCU_BOOST
seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
gpnum = rsp->gpnum;
seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
"nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
- rsp->completed, gpnum, rsp->signaled,
+ rsp->completed, gpnum, rsp->fqs_state,
(long)(rsp->jiffies_force_qs - jiffies),
(int)(jiffies & 0xffff),
rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/relay.c b/kernel/relay.c
index 226fade4d72..ab56a1764d4 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -164,10 +164,14 @@ depopulate:
*/
static struct rchan_buf *relay_create_buf(struct rchan *chan)
{
- struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
- if (!buf)
+ struct rchan_buf *buf;
+
+ if (chan->n_subbufs > UINT_MAX / sizeof(size_t *))
return NULL;
+ buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL);
+ if (!buf)
+ return NULL;
buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL);
if (!buf->padding)
goto free_buf;
@@ -302,7 +306,7 @@ static void buf_unmapped_default_callback(struct rchan_buf *buf,
*/
static struct dentry *create_buf_file_default_callback(const char *filename,
struct dentry *parent,
- int mode,
+ umode_t mode,
struct rchan_buf *buf,
int *is_global)
{
@@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename,
if (!(subbuf_size && n_subbufs))
return NULL;
+ if (subbuf_size > UINT_MAX / n_subbufs)
+ return NULL;
chan = kzalloc(sizeof(struct rchan), GFP_KERNEL);
if (!chan)
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 34683efa2cc..d508363858b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -66,6 +66,31 @@ done:
return ret;
}
+int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
+ struct res_counter **limit_fail_at)
+{
+ int ret, r;
+ unsigned long flags;
+ struct res_counter *c;
+
+ r = ret = 0;
+ *limit_fail_at = NULL;
+ local_irq_save(flags);
+ for (c = counter; c != NULL; c = c->parent) {
+ spin_lock(&c->lock);
+ r = res_counter_charge_locked(c, val);
+ if (r)
+ c->usage += val;
+ spin_unlock(&c->lock);
+ if (r < 0 && ret == 0) {
+ *limit_fail_at = c;
+ ret = r;
+ }
+ }
+ local_irq_restore(flags);
+
+ return ret;
+}
void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
{
if (WARN_ON(counter->usage < val))
@@ -159,8 +184,7 @@ int res_counter_memparse_write_strategy(const char *buf,
return 0;
}
- /* FIXME - make memparse() take const char* args */
- *res = memparse((char *)buf, &end);
+ *res = memparse(buf, &end);
if (*end != '\0')
return -EINVAL;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 8eafd1bd273..16502d3a71c 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
printk("\n============================================\n");
printk( "[ BUG: circular locking deadlock detected! ]\n");
+ printk("%s\n", print_tainted());
printk( "--------------------------------------------\n");
printk("%s/%d is deadlocking current task %s/%d\n\n",
task->comm, task_pid_nr(task),
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 3d9f31cd79e..98ec4947546 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -6,11 +6,11 @@
* Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
*
*/
+#include <linux/device.h>
#include <linux/kthread.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
-#include <linux/sysdev.h>
#include <linux/timer.h>
#include <linux/freezer.h>
@@ -27,7 +27,7 @@ struct test_thread_data {
int opdata;
int mutexes[MAX_RT_TEST_MUTEXES];
int event;
- struct sys_device sysdev;
+ struct device dev;
};
static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
@@ -271,7 +271,7 @@ static int test_func(void *data)
*
* opcode:data
*/
-static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct sched_param schedpar;
@@ -279,8 +279,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
char cmdbuf[32];
int op, dat, tid, ret;
- td = container_of(dev, struct test_thread_data, sysdev);
- tid = td->sysdev.id;
+ td = container_of(dev, struct test_thread_data, dev);
+ tid = td->dev.id;
/* strings from sysfs write are not 0 terminated! */
if (count >= sizeof(cmdbuf))
@@ -334,7 +334,7 @@ static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribut
* @dev: thread to query
* @buf: char buffer to be filled with thread status info
*/
-static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
+static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct test_thread_data *td;
@@ -342,8 +342,8 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
char *curr = buf;
int i;
- td = container_of(dev, struct test_thread_data, sysdev);
- tsk = threads[td->sysdev.id];
+ td = container_of(dev, struct test_thread_data, dev);
+ tsk = threads[td->dev.id];
spin_lock(&rttest_lock);
@@ -360,28 +360,29 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
spin_unlock(&rttest_lock);
curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
- mutexes[td->sysdev.id].owner);
+ mutexes[td->dev.id].owner);
return curr - buf;
}
-static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
-static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
-static struct sysdev_class rttest_sysclass = {
+static struct bus_type rttest_subsys = {
.name = "rttest",
+ .dev_name = "rttest",
};
static int init_test_thread(int id)
{
- thread_data[id].sysdev.cls = &rttest_sysclass;
- thread_data[id].sysdev.id = id;
+ thread_data[id].dev.bus = &rttest_subsys;
+ thread_data[id].dev.id = id;
threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
if (IS_ERR(threads[id]))
return PTR_ERR(threads[id]);
- return sysdev_register(&thread_data[id].sysdev);
+ return device_register(&thread_data[id].dev);
}
static int init_rttest(void)
@@ -393,7 +394,7 @@ static int init_rttest(void)
for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
rt_mutex_init(&mutexes[i]);
- ret = sysdev_class_register(&rttest_sysclass);
+ ret = subsys_system_register(&rttest_subsys, NULL);
if (ret)
return ret;
@@ -401,10 +402,10 @@ static int init_rttest(void)
ret = init_test_thread(i);
if (ret)
break;
- ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
+ ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
if (ret)
break;
- ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
+ ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
if (ret)
break;
}
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd48..a242e691c99 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct rt_mutex_waiter *waiter)
{
int ret = 0;
- int was_disabled;
for (;;) {
/* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
raw_spin_unlock(&lock->wait_lock);
- was_disabled = irqs_disabled();
- if (was_disabled)
- local_irq_enable();
-
debug_rt_mutex_print_deadlock(waiter);
schedule_rt_mutex(lock);
- if (was_disabled)
- local_irq_disable();
-
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 00000000000..9a7dd35102a
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_clock.o = -pg
+endif
+
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
+# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+# needed for x86 only. Why this used to be enabled for all architectures is beyond
+# me. I suspect most platforms don't need this, but until we know that for sure
+# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
+# to get a correct value for the wait-channel (WCHAN in ps). --davidm
+CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
+endif
+
+obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
+obj-$(CONFIG_SCHED_DEBUG) += debug.o
+
+
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c48..e8a1f83ee0e 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
#ifdef CONFIG_SCHED_AUTOGROUP
+#include "sched.h"
+
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+#include <linux/security.h>
+#include <linux/export.h>
unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
-static void __init autogroup_init(struct task_struct *init_task)
+void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
init_task->signal->autogroup = &autogroup_default;
}
-static inline void autogroup_free(struct task_group *tg)
+void autogroup_free(struct task_group *tg)
{
kfree(tg->autogroup);
}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
return ag;
}
-#ifdef CONFIG_RT_GROUP_SCHED
-static void free_rt_sched_group(struct task_group *tg);
-#endif
-
static inline struct autogroup *autogroup_create(void)
{
struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
return autogroup_kref_get(&autogroup_default);
}
-static inline bool
-task_wants_autogroup(struct task_struct *p, struct task_group *tg)
+bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
{
if (tg != &root_task_group)
return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
return true;
}
-static inline bool task_group_is_autogroup(struct task_group *tg)
-{
- return !!tg->autogroup;
-}
-
-static inline struct task_group *
-autogroup_task_group(struct task_struct *p, struct task_group *tg)
-{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
-
- if (enabled && task_wants_autogroup(p, tg))
- return p->signal->autogroup->tg;
-
- return tg;
-}
-
static void
autogroup_move_group(struct task_struct *p, struct autogroup *ag)
{
@@ -263,7 +246,7 @@ out:
#endif /* CONFIG_PROC_FS */
#ifdef CONFIG_SCHED_DEBUG
-static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
+int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
if (!task_group_is_autogroup(tg))
return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dc..8bd04714281 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
#ifdef CONFIG_SCHED_AUTOGROUP
+#include <linux/kref.h>
+#include <linux/rwsem.h>
+
struct autogroup {
/*
* reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
int nice;
};
-static inline bool task_group_is_autogroup(struct task_group *tg);
+extern void autogroup_init(struct task_struct *init_task);
+extern void autogroup_free(struct task_group *tg);
+
+static inline bool task_group_is_autogroup(struct task_group *tg)
+{
+ return !!tg->autogroup;
+}
+
+extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
+
static inline struct task_group *
-autogroup_task_group(struct task_struct *p, struct task_group *tg);
+autogroup_task_group(struct task_struct *p, struct task_group *tg)
+{
+ int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
+
+ if (enabled && task_wants_autogroup(p, tg))
+ return p->signal->autogroup->tg;
+
+ return tg;
+}
+
+extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
#else /* !CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492d..c685e31492d 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index 0e9344a71be..33a0676ea74 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
/*
- * kernel/sched.c
+ * kernel/sched/core.c
*
* Kernel scheduler and related syscalls
*
@@ -56,7 +56,6 @@
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/stop_machine.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
@@ -71,6 +70,7 @@
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
+#include <linux/init_task.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
@@ -79,124 +79,13 @@
#include <asm/paravirt.h>
#endif
-#include "sched_cpupri.h"
-#include "workqueue_sched.h"
-#include "sched_autogroup.h"
+#include "sched.h"
+#include "../workqueue_sched.h"
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
-
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-
-/*
- * Helpers for converting nanosecond timing to jiffy resolution
- */
-#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
-
-#define NICE_0_LOAD SCHED_LOAD_SCALE
-#define NICE_0_SHIFT SCHED_LOAD_SHIFT
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define DEF_TIMESLICE (100 * HZ / 1000)
-
-/*
- * single value that denotes runtime == period, ie unlimited time.
- */
-#define RUNTIME_INF ((u64)~0ULL)
-
-static inline int rt_policy(int policy)
-{
- if (policy == SCHED_FIFO || policy == SCHED_RR)
- return 1;
- return 0;
-}
-
-static inline int task_has_rt_policy(struct task_struct *p)
-{
- return rt_policy(p->policy);
-}
-
-/*
- * This is the priority-queue data structure of the RT scheduling class:
- */
-struct rt_prio_array {
- DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_RT_PRIO];
-};
-
-struct rt_bandwidth {
- /* nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
- ktime_t rt_period;
- u64 rt_runtime;
- struct hrtimer rt_period_timer;
-};
-
-static struct rt_bandwidth def_rt_bandwidth;
-
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
-
-static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
-{
- struct rt_bandwidth *rt_b =
- container_of(timer, struct rt_bandwidth, rt_period_timer);
- ktime_t now;
- int overrun;
- int idle = 0;
-
- for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
- if (!overrun)
- break;
-
- idle = do_sched_rt_period_timer(rt_b, overrun);
- }
-
- return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
-
-static
-void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
-{
- rt_b->rt_period = ns_to_ktime(period);
- rt_b->rt_runtime = runtime;
-
- raw_spin_lock_init(&rt_b->rt_runtime_lock);
-
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- rt_b->rt_period_timer.function = sched_rt_period_timer;
-}
-
-static inline int rt_bandwidth_enabled(void)
-{
- return sysctl_sched_rt_runtime >= 0;
-}
-
-static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
{
unsigned long delta;
ktime_t soft, hard, now;
@@ -216,580 +105,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
}
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
- if (hrtimer_active(&rt_b->rt_period_timer))
- return;
-
- raw_spin_lock(&rt_b->rt_runtime_lock);
- start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
- raw_spin_unlock(&rt_b->rt_runtime_lock);
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
- hrtimer_cancel(&rt_b->rt_period_timer);
-}
-#endif
-
-/*
- * sched_domains_mutex serializes calls to init_sched_domains,
- * detach_destroy_domains and partition_sched_domains.
- */
-static DEFINE_MUTEX(sched_domains_mutex);
-
-#ifdef CONFIG_CGROUP_SCHED
-
-#include <linux/cgroup.h>
-
-struct cfs_rq;
-
-static LIST_HEAD(task_groups);
-
-struct cfs_bandwidth {
-#ifdef CONFIG_CFS_BANDWIDTH
- raw_spinlock_t lock;
- ktime_t period;
- u64 quota, runtime;
- s64 hierarchal_quota;
- u64 runtime_expires;
-
- int idle, timer_active;
- struct hrtimer period_timer, slack_timer;
- struct list_head throttled_cfs_rq;
-
- /* statistics */
- int nr_periods, nr_throttled;
- u64 throttled_time;
-#endif
-};
-
-/* task group related information */
-struct task_group {
- struct cgroup_subsys_state css;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /* schedulable entities of this group on each cpu */
- struct sched_entity **se;
- /* runqueue "owned" by this group on each cpu */
- struct cfs_rq **cfs_rq;
- unsigned long shares;
-
- atomic_t load_weight;
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity **rt_se;
- struct rt_rq **rt_rq;
-
- struct rt_bandwidth rt_bandwidth;
-#endif
-
- struct rcu_head rcu;
- struct list_head list;
-
- struct task_group *parent;
- struct list_head siblings;
- struct list_head children;
-
-#ifdef CONFIG_SCHED_AUTOGROUP
- struct autogroup *autogroup;
-#endif
-
- struct cfs_bandwidth cfs_bandwidth;
-};
-
-/* task_group_lock serializes the addition/removal of task groups */
-static DEFINE_SPINLOCK(task_group_lock);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
-
-/*
- * A weight of 0 or 1 can cause arithmetics problems.
- * A weight of a cfs_rq is the sum of weights of which entities
- * are queued on this cfs_rq, so a weight of a entity should not be
- * too large, so as the shares value of a task group.
- * (The default weight is 1024 - so there's no practical
- * limitation from this.)
- */
-#define MIN_SHARES (1UL << 1)
-#define MAX_SHARES (1UL << 18)
-
-static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
-#endif
-
-/* Default task group.
- * Every task in system belong to this group at bootup.
- */
-struct task_group root_task_group;
-
-#endif /* CONFIG_CGROUP_SCHED */
-
-/* CFS-related fields in a runqueue */
-struct cfs_rq {
- struct load_weight load;
- unsigned long nr_running, h_nr_running;
-
- u64 exec_clock;
- u64 min_vruntime;
-#ifndef CONFIG_64BIT
- u64 min_vruntime_copy;
-#endif
-
- struct rb_root tasks_timeline;
- struct rb_node *rb_leftmost;
-
- struct list_head tasks;
- struct list_head *balance_iterator;
-
- /*
- * 'curr' points to currently running entity on this cfs_rq.
- * It is set to NULL otherwise (i.e when none are currently running).
- */
- struct sched_entity *curr, *next, *last, *skip;
-
-#ifdef CONFIG_SCHED_DEBUG
- unsigned int nr_spread_over;
-#endif
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
-
- /*
- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
- * (like users, containers etc.)
- *
- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
- * list is used during load balance.
- */
- int on_list;
- struct list_head leaf_cfs_rq_list;
- struct task_group *tg; /* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
- /*
- * the part of load.weight contributed by tasks
- */
- unsigned long task_weight;
-
- /*
- * h_load = weight * f(tg)
- *
- * Where f(tg) is the recursive weight fraction assigned to
- * this group.
- */
- unsigned long h_load;
-
- /*
- * Maintaining per-cpu shares distribution for group scheduling
- *
- * load_stamp is the last time we updated the load average
- * load_last is the last time we updated the load average and saw load
- * load_unacc_exec_time is currently unaccounted execution time
- */
- u64 load_avg;
- u64 load_period;
- u64 load_stamp, load_last, load_unacc_exec_time;
-
- unsigned long load_contribution;
-#endif
-#ifdef CONFIG_CFS_BANDWIDTH
- int runtime_enabled;
- u64 runtime_expires;
- s64 runtime_remaining;
-
- u64 throttled_timestamp;
- int throttled, throttle_count;
- struct list_head throttled_list;
-#endif
-#endif
-};
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_CFS_BANDWIDTH
-static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
-{
- return &tg->cfs_bandwidth;
-}
-
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
-static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
-{
- struct cfs_bandwidth *cfs_b =
- container_of(timer, struct cfs_bandwidth, slack_timer);
- do_sched_cfs_slack_timer(cfs_b);
-
- return HRTIMER_NORESTART;
-}
-
-static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
-{
- struct cfs_bandwidth *cfs_b =
- container_of(timer, struct cfs_bandwidth, period_timer);
- ktime_t now;
- int overrun;
- int idle = 0;
-
- for (;;) {
- now = hrtimer_cb_get_time(timer);
- overrun = hrtimer_forward(timer, now, cfs_b->period);
-
- if (!overrun)
- break;
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun);
- }
-
- return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
-
-static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
- raw_spin_lock_init(&cfs_b->lock);
- cfs_b->runtime = 0;
- cfs_b->quota = RUNTIME_INF;
- cfs_b->period = ns_to_ktime(default_cfs_period());
-
- INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
- hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->period_timer.function = sched_cfs_period_timer;
- hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- cfs_b->slack_timer.function = sched_cfs_slack_timer;
-}
-
-static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
- cfs_rq->runtime_enabled = 0;
- INIT_LIST_HEAD(&cfs_rq->throttled_list);
-}
-
-/* requires cfs_b->lock, may release to reprogram timer */
-static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
- /*
- * The timer may be active because we're trying to set a new bandwidth
- * period or because we're racing with the tear-down path
- * (timer_active==0 becomes visible before the hrtimer call-back
- * terminates). In either case we ensure that it's re-programmed
- */
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
- raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
- raw_spin_lock(&cfs_b->lock);
- /* if someone else restarted the timer then we're done */
- if (cfs_b->timer_active)
- return;
- }
-
- cfs_b->timer_active = 1;
- start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
-}
-
-static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
- hrtimer_cancel(&cfs_b->period_timer);
- hrtimer_cancel(&cfs_b->slack_timer);
-}
-#else
-static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
-static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
-{
- return NULL;
-}
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-/* Real-Time classes' related field in a runqueue: */
-struct rt_rq {
- struct rt_prio_array active;
- unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
- struct {
- int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
- int next; /* next highest */
-#endif
- } highest_prio;
-#endif
-#ifdef CONFIG_SMP
- unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
- int overloaded;
- struct plist_head pushable_tasks;
-#endif
- int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
- /* Nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
-
-#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
-
- struct rq *rq;
- struct list_head leaf_rt_rq_list;
- struct task_group *tg;
-#endif
-};
-
-#ifdef CONFIG_SMP
-
-/*
- * We add the notion of a root-domain which will be used to define per-domain
- * variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
- * exclusive cpuset is created, we also create and attach a new root-domain
- * object.
- *
- */
-struct root_domain {
- atomic_t refcount;
- atomic_t rto_count;
- struct rcu_head rcu;
- cpumask_var_t span;
- cpumask_var_t online;
-
- /*
- * The "RT overload" flag: it gets set if a CPU has more than
- * one runnable RT task.
- */
- cpumask_var_t rto_mask;
- struct cpupri cpupri;
-};
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-#endif /* CONFIG_SMP */
-
-/*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
- */
-struct rq {
- /* runqueue lock: */
- raw_spinlock_t lock;
-
- /*
- * nr_running and cpu_load should be in the same cacheline because
- * remote CPUs use both these fields when doing load calculation.
- */
- unsigned long nr_running;
- #define CPU_LOAD_IDX_MAX 5
- unsigned long cpu_load[CPU_LOAD_IDX_MAX];
- unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
- u64 nohz_stamp;
- unsigned char nohz_balance_kick;
-#endif
- int skip_clock_update;
-
- /* capture load from *all* tasks on this cpu: */
- struct load_weight load;
- unsigned long nr_load_updates;
- u64 nr_switches;
-
- struct cfs_rq cfs;
- struct rt_rq rt;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- /* list of leaf cfs_rq on this cpu: */
- struct list_head leaf_cfs_rq_list;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
- struct list_head leaf_rt_rq_list;
-#endif
-
- /*
- * This is part of a global counter where only the total sum
- * over all CPUs matters. A task can increase this counter on
- * one CPU and if it got migrated afterwards it may decrease
- * it on another CPU. Always updated under the runqueue lock:
- */
- unsigned long nr_uninterruptible;
-
- struct task_struct *curr, *idle, *stop;
- unsigned long next_balance;
- struct mm_struct *prev_mm;
-
- u64 clock;
- u64 clock_task;
-
- atomic_t nr_iowait;
-
-#ifdef CONFIG_SMP
- struct root_domain *rd;
- struct sched_domain *sd;
-
- unsigned long cpu_power;
-
- unsigned char idle_balance;
- /* For active balancing */
- int post_schedule;
- int active_balance;
- int push_cpu;
- struct cpu_stop_work active_balance_work;
- /* cpu of this runqueue: */
- int cpu;
- int online;
-
- u64 rt_avg;
- u64 age_stamp;
- u64 idle_stamp;
- u64 avg_idle;
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- u64 prev_irq_time;
-#endif
-#ifdef CONFIG_PARAVIRT
- u64 prev_steal_time;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- u64 prev_steal_time_rq;
-#endif
-
- /* calc_load related fields */
- unsigned long calc_load_update;
- long calc_load_active;
-
-#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
- int hrtick_csd_pending;
- struct call_single_data hrtick_csd;
-#endif
- struct hrtimer hrtick_timer;
-#endif
-
-#ifdef CONFIG_SCHEDSTATS
- /* latency stats */
- struct sched_info rq_sched_info;
- unsigned long long rq_cpu_time;
- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
-
- /* sys_sched_yield() stats */
- unsigned int yld_count;
-
- /* schedule() stats */
- unsigned int sched_switch;
- unsigned int sched_count;
- unsigned int sched_goidle;
-
- /* try_to_wake_up() stats */
- unsigned int ttwu_count;
- unsigned int ttwu_local;
-#endif
-
-#ifdef CONFIG_SMP
- struct llist_head wake_list;
-#endif
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-
-
-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
-
-static inline int cpu_of(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq->cpu;
-#else
- return 0;
-#endif
-}
-
-#define rcu_dereference_check_sched_domain(p) \
- rcu_dereference_check((p), \
- lockdep_is_held(&sched_domains_mutex))
-
-/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
- *
- * The domain tree of any CPU may only be accessed from within
- * preempt-disabled sections.
- */
-#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
-
-#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
-#define this_rq() (&__get_cpu_var(runqueues))
-#define task_rq(p) cpu_rq(task_cpu(p))
-#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-#define raw_rq() (&__raw_get_cpu_var(runqueues))
-
-#ifdef CONFIG_CGROUP_SCHED
-
-/*
- * Return the group to which this tasks belongs.
- *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
- */
-static inline struct task_group *task_group(struct task_struct *p)
-{
- struct task_group *tg;
- struct cgroup_subsys_state *css;
-
- css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&task_rq(p)->lock));
- tg = container_of(css, struct task_group, css);
-
- return autogroup_task_group(p, tg);
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
- p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
- p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
- p->rt.rt_rq = task_group(p)->rt_rq[cpu];
- p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-}
-
-#else /* CONFIG_CGROUP_SCHED */
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
- return NULL;
-}
-
-#endif /* CONFIG_CGROUP_SCHED */
+DEFINE_MUTEX(sched_domains_mutex);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static void update_rq_clock_task(struct rq *rq, s64 delta);
-static void update_rq_clock(struct rq *rq)
+void update_rq_clock(struct rq *rq)
{
s64 delta;
@@ -802,44 +123,14 @@ static void update_rq_clock(struct rq *rq)
}
/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
- */
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug static const
-#endif
-
-/**
- * runqueue_is_locked - Returns true if the current cpu runqueue is locked
- * @cpu: the processor in question.
- *
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-int runqueue_is_locked(int cpu)
-{
- return raw_spin_is_locked(&cpu_rq(cpu)->lock);
-}
-
-/*
* Debugging: various feature bits
*/
#define SCHED_FEAT(name, enabled) \
- __SCHED_FEAT_##name ,
-
-enum {
-#include "sched_features.h"
-};
-
-#undef SCHED_FEAT
-
-#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
-#include "sched_features.h"
+#include "features.h"
0;
#undef SCHED_FEAT
@@ -849,7 +140,7 @@ const_debug unsigned int sysctl_sched_features =
#name ,
static __read_mostly char *sched_feat_names[] = {
-#include "sched_features.h"
+#include "features.h"
NULL
};
@@ -859,7 +150,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
{
int i;
- for (i = 0; sched_feat_names[i]; i++) {
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (!(sysctl_sched_features & (1UL << i)))
seq_puts(m, "NO_");
seq_printf(m, "%s ", sched_feat_names[i]);
@@ -869,6 +160,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
return 0;
}
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true jump_label_key_enabled
+#define jump_label_key__false jump_label_key_disabled
+
+#define SCHED_FEAT(name, enabled) \
+ jump_label_key__##enabled ,
+
+struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+ if (jump_label_enabled(&sched_feat_keys[i]))
+ jump_label_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+ if (!jump_label_enabled(&sched_feat_keys[i]))
+ jump_label_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -892,17 +213,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
cmp += 3;
}
- for (i = 0; sched_feat_names[i]; i++) {
+ for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
- if (neg)
+ if (neg) {
sysctl_sched_features &= ~(1UL << i);
- else
+ sched_feat_disable(i);
+ } else {
sysctl_sched_features |= (1UL << i);
+ sched_feat_enable(i);
+ }
break;
}
}
- if (!sched_feat_names[i])
+ if (i == __SCHED_FEAT_NR)
return -EINVAL;
*ppos += cnt;
@@ -931,10 +255,7 @@ static __init int sched_init_debug(void)
return 0;
}
late_initcall(sched_init_debug);
-
-#endif
-
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
@@ -956,7 +277,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
*/
unsigned int sysctl_sched_rt_period = 1000000;
-static __read_mostly int scheduler_running;
+__read_mostly int scheduler_running;
/*
* part of the period that we allow rt tasks to run in us.
@@ -964,112 +285,7 @@ static __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-static inline u64 global_rt_period(void)
-{
- return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
-}
-
-static inline u64 global_rt_runtime(void)
-{
- if (sysctl_sched_rt_runtime < 0)
- return RUNTIME_INF;
-
- return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
-}
-
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next) do { } while (0)
-#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev) do { } while (0)
-#endif
-
-static inline int task_current(struct rq *rq, struct task_struct *p)
-{
- return rq->curr == p;
-}
-
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->on_cpu;
-#else
- return task_current(rq, p);
-#endif
-}
-
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
-}
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- */
- smp_wmb();
- prev->on_cpu = 0;
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
- /* this is a valid case when another task releases the spinlock */
- rq->lock.owner = current;
-#endif
- /*
- * If we are tracking spinlock dependencies then we have to
- * fix up the runqueue lock - which gets 'carried over' from
- * prev into current:
- */
- spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
- raw_spin_unlock_irq(&rq->lock);
-}
-
-#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
- /*
- * We can optimise this out completely for !SMP, because the
- * SMP rebalancing from interrupt is the only thing that cares
- * here.
- */
- next->on_cpu = 1;
-#endif
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- raw_spin_unlock_irq(&rq->lock);
-#else
- raw_spin_unlock(&rq->lock);
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
- /*
- * After ->on_cpu is cleared, the task can be moved to a different CPU.
- * We must ensure this doesn't happen until the switch is completely
- * finished.
- */
- smp_wmb();
- prev->on_cpu = 0;
-#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
- local_irq_enable();
-#endif
-}
-#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -1152,20 +368,6 @@ static struct rq *this_rq_lock(void)
* rq->lock.
*/
-/*
- * Use hrtick when:
- * - enabled by features
- * - hrtimer is actually high res
- */
-static inline int hrtick_enabled(struct rq *rq)
-{
- if (!sched_feat(HRTICK))
- return 0;
- if (!cpu_active(cpu_of(rq)))
- return 0;
- return hrtimer_is_hres_active(&rq->hrtick_timer);
-}
-
static void hrtick_clear(struct rq *rq)
{
if (hrtimer_active(&rq->hrtick_timer))
@@ -1209,7 +411,7 @@ static void __hrtick_start(void *arg)
*
* called with rq->lock held and irqs disabled
*/
-static void hrtick_start(struct rq *rq, u64 delay)
+void hrtick_start(struct rq *rq, u64 delay)
{
struct hrtimer *timer = &rq->hrtick_timer;
ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1253,7 +455,7 @@ static __init void init_hrtick(void)
*
* called with rq->lock held and irqs disabled
*/
-static void hrtick_start(struct rq *rq, u64 delay)
+void hrtick_start(struct rq *rq, u64 delay)
{
__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
HRTIMER_MODE_REL_PINNED, 0);
@@ -1304,7 +506,7 @@ static inline void init_hrtick(void)
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
-static void resched_task(struct task_struct *p)
+void resched_task(struct task_struct *p)
{
int cpu;
@@ -1325,7 +527,7 @@ static void resched_task(struct task_struct *p)
smp_send_reschedule(cpu);
}
-static void resched_cpu(int cpu)
+void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
@@ -1406,7 +608,8 @@ void wake_up_idle_cpu(int cpu)
static inline bool got_nohz_idle_kick(void)
{
- return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
+ int cpu = smp_processor_id();
+ return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}
#else /* CONFIG_NO_HZ */
@@ -1418,12 +621,7 @@ static inline bool got_nohz_idle_kick(void)
#endif /* CONFIG_NO_HZ */
-static u64 sched_avg_period(void)
-{
- return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
-static void sched_avg_update(struct rq *rq)
+void sched_avg_update(struct rq *rq)
{
s64 period = sched_avg_period();
@@ -1439,193 +637,23 @@ static void sched_avg_update(struct rq *rq)
}
}
-static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
- rq->rt_avg += rt_delta;
- sched_avg_update(rq);
-}
-
#else /* !CONFIG_SMP */
-static void resched_task(struct task_struct *p)
+void resched_task(struct task_struct *p)
{
assert_raw_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
-
-static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-}
-
-static void sched_avg_update(struct rq *rq)
-{
-}
#endif /* CONFIG_SMP */
-#if BITS_PER_LONG == 32
-# define WMULT_CONST (~0UL)
-#else
-# define WMULT_CONST (1UL << 32)
-#endif
-
-#define WMULT_SHIFT 32
-
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-
-/*
- * delta *= weight / lw
- */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
- struct load_weight *lw)
-{
- u64 tmp;
-
- /*
- * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
- * entities since MIN_SHARES = 2. Treat weight as 1 if less than
- * 2^SCHED_LOAD_RESOLUTION.
- */
- if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
- tmp = (u64)delta_exec * scale_load_down(weight);
- else
- tmp = (u64)delta_exec;
-
- if (!lw->inv_weight) {
- unsigned long w = scale_load_down(lw->weight);
-
- if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
- lw->inv_weight = 1;
- else if (unlikely(!w))
- lw->inv_weight = WMULT_CONST;
- else
- lw->inv_weight = WMULT_CONST / w;
- }
-
- /*
- * Check whether we'd overflow the 64-bit multiplication:
- */
- if (unlikely(tmp > WMULT_CONST))
- tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
- WMULT_SHIFT/2);
- else
- tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
-
- return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
-}
-
-static inline void update_load_add(struct load_weight *lw, unsigned long inc)
-{
- lw->weight += inc;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
-{
- lw->weight -= dec;
- lw->inv_weight = 0;
-}
-
-static inline void update_load_set(struct load_weight *lw, unsigned long w)
-{
- lw->weight = w;
- lw->inv_weight = 0;
-}
-
-/*
- * To aid in avoiding the subversion of "niceness" due to uneven distribution
- * of tasks with abnormal "nice" values across CPUs the contribution that
- * each task makes to its run queue's load is weighted according to its
- * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
- * scaled version of the new time slice allocation that they receive on time
- * slice expiry etc.
- */
-
-#define WEIGHT_IDLEPRIO 3
-#define WMULT_IDLEPRIO 1431655765
-
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */ 88761, 71755, 56483, 46273, 36291,
- /* -15 */ 29154, 23254, 18705, 14949, 11916,
- /* -10 */ 9548, 7620, 6100, 4904, 3906,
- /* -5 */ 3121, 2501, 1991, 1586, 1277,
- /* 0 */ 1024, 820, 655, 526, 423,
- /* 5 */ 335, 272, 215, 172, 137,
- /* 10 */ 110, 87, 70, 56, 45,
- /* 15 */ 36, 29, 23, 18, 15,
-};
-
-/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- *
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
- */
-static const u32 prio_to_wmult[40] = {
- /* -20 */ 48388, 59856, 76040, 92818, 118348,
- /* -15 */ 147320, 184698, 229616, 287308, 360437,
- /* -10 */ 449829, 563644, 704093, 875809, 1099582,
- /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
- /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
- /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
- /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
- /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
-
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
- CPUACCT_STAT_USER, /* ... user mode */
- CPUACCT_STAT_SYSTEM, /* ... kernel mode */
-
- CPUACCT_STAT_NSTATS,
-};
-
-#ifdef CONFIG_CGROUP_CPUACCT
-static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-static void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val) {}
-#endif
-
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
- update_load_add(&rq->load, load);
-}
-
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
- update_load_sub(&rq->load, load);
-}
-
#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
-typedef int (*tg_visitor)(struct task_group *, void *);
-
/*
* Iterate task_group tree rooted at *from, calling @down when first entering a
* node and @up when leaving it for the final time.
*
* Caller must hold rcu_lock or sufficient equivalent.
*/
-static int walk_tg_tree_from(struct task_group *from,
+int walk_tg_tree_from(struct task_group *from,
tg_visitor down, tg_visitor up, void *data)
{
struct task_group *parent, *child;
@@ -1656,270 +684,13 @@ out:
return ret;
}
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- *
- * Caller must hold rcu_lock or sufficient equivalent.
- */
-
-static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
-{
- return walk_tg_tree_from(&root_task_group, down, up, data);
-}
-
-static int tg_nop(struct task_group *tg, void *data)
+int tg_nop(struct task_group *tg, void *data)
{
return 0;
}
#endif
-#ifdef CONFIG_SMP
-/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
-{
- return cpu_rq(cpu)->load.weight;
-}
-
-/*
- * Return a low guess at the load of a migration-source cpu weighted
- * according to the scheduling class and "nice" value.
- *
- * We want to under-estimate the load of migration sources, to
- * balance conservatively.
- */
-static unsigned long source_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return min(rq->cpu_load[type-1], total);
-}
-
-/*
- * Return a high guess at the load of a migration-target cpu weighted
- * according to the scheduling class and "nice" value.
- */
-static unsigned long target_load(int cpu, int type)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
-
- if (type == 0 || !sched_feat(LB_BIAS))
- return total;
-
- return max(rq->cpu_load[type-1], total);
-}
-
-static unsigned long power_of(int cpu)
-{
- return cpu_rq(cpu)->cpu_power;
-}
-
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
-
- if (nr_running)
- return rq->load.weight / nr_running;
-
- return 0;
-}
-
-#ifdef CONFIG_PREEMPT
-
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-
-/*
- * fair double_lock_balance: Safely acquires both rq->locks in a fair
- * way at the expense of forcing extra atomic operations in all
- * invocations. This assures that the double_lock is acquired using the
- * same underlying policy as the spinlock_t on this architecture, which
- * reduces latency compared to the unfair variant below. However, it
- * also adds more overhead and therefore may reduce throughput.
- */
-static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(this_rq->lock)
- __acquires(busiest->lock)
- __acquires(this_rq->lock)
-{
- raw_spin_unlock(&this_rq->lock);
- double_rq_lock(this_rq, busiest);
-
- return 1;
-}
-
-#else
-/*
- * Unfair double_lock_balance: Optimizes throughput at the expense of
- * latency by eliminating extra atomic operations when the locks are
- * already in proper order on entry. This favors lower cpu-ids and will
- * grant the double lock to lower cpus over higher ids under contention,
- * regardless of entry order into the function.
- */
-static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(this_rq->lock)
- __acquires(busiest->lock)
- __acquires(this_rq->lock)
-{
- int ret = 0;
-
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
- if (busiest < this_rq) {
- raw_spin_unlock(&this_rq->lock);
- raw_spin_lock(&busiest->lock);
- raw_spin_lock_nested(&this_rq->lock,
- SINGLE_DEPTH_NESTING);
- ret = 1;
- } else
- raw_spin_lock_nested(&busiest->lock,
- SINGLE_DEPTH_NESTING);
- }
- return ret;
-}
-
-#endif /* CONFIG_PREEMPT */
-
-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-{
- if (unlikely(!irqs_disabled())) {
- /* printk() doesn't work good under rq->lock */
- raw_spin_unlock(&this_rq->lock);
- BUG_ON(1);
- }
-
- return _double_lock_balance(this_rq, busiest);
-}
-
-static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
- __releases(busiest->lock)
-{
- raw_spin_unlock(&busiest->lock);
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- if (rq1 == rq2) {
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
- } else {
- if (rq1 < rq2) {
- raw_spin_lock(&rq1->lock);
- raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
- } else {
- raw_spin_lock(&rq2->lock);
- raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
- }
- }
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- raw_spin_unlock(&rq1->lock);
- if (rq1 != rq2)
- raw_spin_unlock(&rq2->lock);
- else
- __release(rq2->lock);
-}
-
-#else /* CONFIG_SMP */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
- __acquires(rq1->lock)
- __acquires(rq2->lock)
-{
- BUG_ON(!irqs_disabled());
- BUG_ON(rq1 != rq2);
- raw_spin_lock(&rq1->lock);
- __acquire(rq2->lock); /* Fake it out ;) */
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
- __releases(rq1->lock)
- __releases(rq2->lock)
-{
- BUG_ON(rq1 != rq2);
- raw_spin_unlock(&rq1->lock);
- __release(rq2->lock);
-}
-
-#endif
-
-static void calc_load_account_idle(struct rq *this_rq);
-static void update_sysctl(void);
-static int get_update_sysctl_factor(void);
-static void update_cpu_load(struct rq *this_rq);
-
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
- set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
- /*
- * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
- * successfully executed on another CPU. We must ensure that updates of
- * per-task data have been completed by this moment.
- */
- smp_wmb();
- task_thread_info(p)->cpu = cpu;
-#endif
-}
-
-static const struct sched_class rt_sched_class;
-
-#define sched_class_highest (&stop_sched_class)
-#define for_each_class(class) \
- for (class = sched_class_highest; class; class = class->next)
-
-#include "sched_stats.h"
-
-static void inc_nr_running(struct rq *rq)
-{
- rq->nr_running++;
-}
-
-static void dec_nr_running(struct rq *rq)
-{
- rq->nr_running--;
-}
+void update_cpu_load(struct rq *this_rq);
static void set_load_weight(struct task_struct *p)
{
@@ -1953,10 +724,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
p->sched_class->dequeue_task(rq, p, flags);
}
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int flags)
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible--;
@@ -1964,10 +732,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
enqueue_task(rq, p, flags);
}
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_contributes_to_load(p))
rq->nr_uninterruptible++;
@@ -2158,14 +923,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
u64 latest_ns;
int ret = 0;
local_irq_save(flags);
latest_ns = this_cpu_read(cpu_hardirq_time);
- if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
ret = 1;
local_irq_restore(flags);
return ret;
@@ -2173,14 +938,14 @@ static int irqtime_account_hi_update(void)
static int irqtime_account_si_update(void)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
u64 latest_ns;
int ret = 0;
local_irq_save(flags);
latest_ns = this_cpu_read(cpu_softirq_time);
- if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
ret = 1;
local_irq_restore(flags);
return ret;
@@ -2192,15 +957,6 @@ static int irqtime_account_si_update(void)
#endif
-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#include "sched_autogroup.c"
-#include "sched_stoptask.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2298,7 +1054,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
p->sched_class->prio_changed(rq, p, oldprio);
}
-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
const struct sched_class *class;
@@ -2324,38 +1080,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
-/*
- * Is this task likely cache-hot:
- */
-static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
-{
- s64 delta;
-
- if (p->sched_class != &fair_sched_class)
- return 0;
-
- if (unlikely(p->policy == SCHED_IDLE))
- return 0;
-
- /*
- * Buddy candidates are cache hot:
- */
- if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
- (&p->se == cfs_rq_of(&p->se)->next ||
- &p->se == cfs_rq_of(&p->se)->last))
- return 1;
-
- if (sysctl_sched_migration_cost == -1)
- return 1;
- if (sysctl_sched_migration_cost == 0)
- return 0;
-
- delta = now - p->se.exec_start;
-
- return delta < (s64)sysctl_sched_migration_cost;
-}
-
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -2782,6 +1506,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
}
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+
+static inline int ttwu_share_cache(int this_cpu, int that_cpu)
+{
+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
+}
#endif /* CONFIG_SMP */
static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2789,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
@@ -3438,7 +2167,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
*/
static atomic_long_t calc_load_tasks_idle;
-static void calc_load_account_idle(struct rq *this_rq)
+void calc_load_account_idle(struct rq *this_rq)
{
long delta;
@@ -3582,7 +2311,7 @@ static void calc_global_nohz(unsigned long ticks)
*/
}
#else
-static void calc_load_account_idle(struct rq *this_rq)
+void calc_load_account_idle(struct rq *this_rq)
{
}
@@ -3725,7 +2454,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
* scheduler tick (TICK_NSEC). With tickless idle this will not be called
* every tick. We fix it up based on jiffies.
*/
-static void update_cpu_load(struct rq *this_rq)
+void update_cpu_load(struct rq *this_rq)
{
unsigned long this_load = this_rq->load.weight;
unsigned long curr_jiffies = jiffies;
@@ -3803,8 +2532,10 @@ unlock:
#endif
DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
* Return any ns on the sched_clock that have not yet been accounted in
@@ -3857,6 +2588,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+ u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+#endif
+ /*
+ * Since all updates are sure to touch the root cgroup, we
+ * get ourselves ahead and touch it first. If the root cgroup
+ * is the only cgroup, then nothing else should be necessary.
+ *
+ */
+ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+ if (unlikely(!cpuacct_subsys.active))
+ return;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca && (ca != &root_cpuacct)) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += tmp;
+ ca = parent_ca(ca);
+ }
+ rcu_read_unlock();
+#endif
+}
+
+
/*
* Account user cpu time to a process.
* @p: the process that the cpu time gets accounted to
@@ -3866,22 +2633,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
void account_user_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ int index;
/* Add user time to process. */
- p->utime = cputime_add(p->utime, cputime);
- p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
+ index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
/* Add user time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
- if (TASK_NICE(p) > 0)
- cpustat->nice = cputime64_add(cpustat->nice, tmp);
- else
- cpustat->user = cputime64_add(cpustat->user, tmp);
+ task_group_account_field(p, index, (__force u64) cputime);
- cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
/* Account for user time used */
acct_update_integrals(p);
}
@@ -3895,24 +2658,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
static void account_guest_time(struct task_struct *p, cputime_t cputime,
cputime_t cputime_scaled)
{
- cputime64_t tmp;
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-
- tmp = cputime_to_cputime64(cputime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
/* Add guest time to process. */
- p->utime = cputime_add(p->utime, cputime);
- p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
+ p->utime += cputime;
+ p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
- p->gtime = cputime_add(p->gtime, cputime);
+ p->gtime += cputime;
/* Add guest time to cpustat. */
if (TASK_NICE(p) > 0) {
- cpustat->nice = cputime64_add(cpustat->nice, tmp);
- cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
+ cpustat[CPUTIME_NICE] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
} else {
- cpustat->user = cputime64_add(cpustat->user, tmp);
- cpustat->guest = cputime64_add(cpustat->guest, tmp);
+ cpustat[CPUTIME_USER] += (__force u64) cputime;
+ cpustat[CPUTIME_GUEST] += (__force u64) cputime;
}
}
@@ -3925,18 +2685,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
*/
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, cputime64_t *target_cputime64)
+ cputime_t cputime_scaled, int index)
{
- cputime64_t tmp = cputime_to_cputime64(cputime);
-
/* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ p->stime += cputime;
+ p->stimescaled += cputime_scaled;
account_group_system_time(p, cputime);
/* Add system time to cpustat. */
- *target_cputime64 = cputime64_add(*target_cputime64, tmp);
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ task_group_account_field(p, index, (__force u64) cputime);
/* Account for system time used */
acct_update_integrals(p);
@@ -3952,8 +2709,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t *target_cputime64;
+ int index;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
@@ -3961,13 +2717,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
}
if (hardirq_count() - hardirq_offset)
- target_cputime64 = &cpustat->irq;
+ index = CPUTIME_IRQ;
else if (in_serving_softirq())
- target_cputime64 = &cpustat->softirq;
+ index = CPUTIME_SOFTIRQ;
else
- target_cputime64 = &cpustat->system;
+ index = CPUTIME_SYSTEM;
- __account_system_time(p, cputime, cputime_scaled, target_cputime64);
+ __account_system_time(p, cputime, cputime_scaled, index);
}
/*
@@ -3976,10 +2732,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
*/
void account_steal_time(cputime_t cputime)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
- cpustat->steal = cputime64_add(cpustat->steal, cputime64);
+ cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}
/*
@@ -3988,14 +2743,13 @@ void account_steal_time(cputime_t cputime)
*/
void account_idle_time(cputime_t cputime)
{
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();
if (atomic_read(&rq->nr_iowait) > 0)
- cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+ cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
else
- cpustat->idle = cputime64_add(cpustat->idle, cputime64);
+ cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
static __always_inline bool steal_account_process_tick(void)
@@ -4045,16 +2799,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq)
{
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
- cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
- struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
if (irqtime_account_hi_update()) {
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
} else if (irqtime_account_si_update()) {
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4062,7 +2815,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
* Also, p->stime needs to be updated for ksoftirqd.
*/
__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- &cpustat->softirq);
+ CPUTIME_SOFTIRQ);
} else if (user_tick) {
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else if (p == rq->idle) {
@@ -4071,7 +2824,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
} else {
__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- &cpustat->system);
+ CPUTIME_SYSTEM);
}
}
@@ -4170,7 +2923,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
+ cputime_t rtime, utime = p->utime, total = utime + p->stime;
/*
* Use CFS's precise accounting:
@@ -4178,11 +2931,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
if (total) {
- u64 temp = rtime;
+ u64 temp = (__force u64) rtime;
- temp *= utime;
- do_div(temp, total);
- utime = (cputime_t)temp;
+ temp *= (__force u64) utime;
+ do_div(temp, (__force u32) total);
+ utime = (__force cputime_t) temp;
} else
utime = rtime;
@@ -4190,7 +2943,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
* Compare with previous values, to keep monotonicity:
*/
p->prev_utime = max(p->prev_utime, utime);
- p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
+ p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
*ut = p->prev_utime;
*st = p->prev_stime;
@@ -4207,21 +2960,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
thread_group_cputime(p, &cputime);
- total = cputime_add(cputime.utime, cputime.stime);
+ total = cputime.utime + cputime.stime;
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
if (total) {
- u64 temp = rtime;
+ u64 temp = (__force u64) rtime;
- temp *= cputime.utime;
- do_div(temp, total);
- utime = (cputime_t)temp;
+ temp *= (__force u64) cputime.utime;
+ do_div(temp, (__force u32) total);
+ utime = (__force cputime_t) temp;
} else
utime = rtime;
sig->prev_utime = max(sig->prev_utime, utime);
- sig->prev_stime = max(sig->prev_stime,
- cputime_sub(rtime, sig->prev_utime));
+ sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
*ut = sig->prev_utime;
*st = sig->prev_stime;
@@ -4320,6 +3072,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
{
struct pt_regs *regs = get_irq_regs();
+ if (oops_in_progress)
+ return;
+
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
prev->comm, prev->pid, preempt_count());
@@ -4810,6 +3565,9 @@ EXPORT_SYMBOL(wait_for_completion);
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
* interruptible.
+ *
+ * The return value is 0 if timed out, and positive (at least 1, or number of
+ * jiffies left till timeout) if completed.
*/
unsigned long __sched
wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4824,6 +3582,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits for completion of a specific task to be signaled. It is
* interruptible.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_interruptible(struct completion *x)
{
@@ -4841,6 +3601,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4856,6 +3619,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
*
* This waits to be signaled for completion of a specific task. It can be
* interrupted by a kill signal.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if completed.
*/
int __sched wait_for_completion_killable(struct completion *x)
{
@@ -4874,6 +3639,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
* This waits for either a completion of a specific task to be
* signaled or for a specified timeout to expire. It can be
* interrupted by a kill signal. The timeout is in jiffies.
+ *
+ * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
+ * positive (at least 1, or number of jiffies left till timeout) if completed.
*/
long __sched
wait_for_completion_killable_timeout(struct completion *x,
@@ -5360,7 +4128,7 @@ recheck:
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
if (running)
p->sched_class->put_prev_task(rq, p);
@@ -5373,7 +4141,7 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -5556,7 +4324,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+ if (!check_same_owner(p) && !ns_capable(task_user_ns(p), CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
@@ -5838,6 +4606,13 @@ again:
*/
if (preempt && rq != p_rq)
resched_task(p_rq->curr);
+ } else {
+ /*
+ * We might have set it in task_yield_fair(), but are
+ * not going to schedule(), so don't want to skip
+ * the next update.
+ */
+ rq->skip_clock_update = 0;
}
out:
@@ -6005,7 +4780,7 @@ void sched_show_task(struct task_struct *p)
free = stack_not_used(p);
#endif
printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
- task_pid_nr(p), task_pid_nr(p->real_parent),
+ task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
(unsigned long)task_thread_info(p)->flags);
show_stack(p, NULL);
@@ -6099,53 +4874,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
-}
-
-/*
- * Increase the granularity value when there are more CPUs,
- * because with more CPUs the 'effective latency' as visible
- * to users decreases. But the relationship is not linear,
- * so pick a second-best guess by going with the log2 of the
- * number of CPUs.
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
-static int get_update_sysctl_factor(void)
-{
- unsigned int cpus = min_t(int, num_online_cpus(), 8);
- unsigned int factor;
-
- switch (sysctl_sched_tunable_scaling) {
- case SCHED_TUNABLESCALING_NONE:
- factor = 1;
- break;
- case SCHED_TUNABLESCALING_LINEAR:
- factor = cpus;
- break;
- case SCHED_TUNABLESCALING_LOG:
- default:
- factor = 1 + ilog2(cpus);
- break;
- }
-
- return factor;
-}
-
-static void update_sysctl(void)
-{
- unsigned int factor = get_update_sysctl_factor();
-
-#define SET_SYSCTL(name) \
- (sysctl_##name = (factor) * normalized_sysctl_##name)
- SET_SYSCTL(sched_min_granularity);
- SET_SYSCTL(sched_latency);
- SET_SYSCTL(sched_wakeup_granularity);
-#undef SET_SYSCTL
-}
-
-static inline void sched_init_granularity(void)
-{
- update_sysctl();
+#if defined(CONFIG_SMP)
+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
}
#ifdef CONFIG_SMP
@@ -6261,9 +4992,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* placed properly.
*/
if (p->on_rq) {
- deactivate_task(rq_src, p, 0);
+ dequeue_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
- activate_task(rq_dest, p, 0);
+ enqueue_task(rq_dest, p, 0);
check_preempt_curr(rq_dest, p, 0);
}
done:
@@ -6334,30 +5065,6 @@ static void calc_global_load_remove(struct rq *rq)
rq->calc_load_active = 0;
}
-#ifdef CONFIG_CFS_BANDWIDTH
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
-{
- struct cfs_rq *cfs_rq;
-
- for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
- if (!cfs_rq->runtime_enabled)
- continue;
-
- /*
- * clock_task is not advancing so we just need to make sure
- * there's some valid quota amount
- */
- cfs_rq->runtime_remaining = cfs_b->quota;
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
- }
-}
-#else
-static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-#endif
-
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -6463,7 +5170,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
- mode_t mode, proc_handler *proc_handler)
+ umode_t mode, proc_handler *proc_handler)
{
entry->procname = procname;
entry->data = data;
@@ -6963,6 +5670,12 @@ out:
return -ENOMEM;
}
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
static void init_defrootdomain(void)
{
init_rootdomain(&def_root_domain);
@@ -7034,6 +5747,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
}
/*
+ * Keep a special pointer to the highest sched_domain that has
+ * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
+ * allows us to avoid some pointer chasing select_idle_sibling().
+ *
+ * Also keep a unique ID per domain (we use the first cpu number in
+ * the cpumask of the domain), this allows us to quickly tell if
+ * two cpus are in the same cache domain, see ttwu_share_cache().
+ */
+DEFINE_PER_CPU(struct sched_domain *, sd_llc);
+DEFINE_PER_CPU(int, sd_llc_id);
+
+static void update_top_cache_domain(int cpu)
+{
+ struct sched_domain *sd;
+ int id = cpu;
+
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ if (sd)
+ id = cpumask_first(sched_domain_span(sd));
+
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ per_cpu(sd_llc_id, cpu) = id;
+}
+
+/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
@@ -7072,6 +5810,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
+
+ update_top_cache_domain(cpu);
}
/* cpus with isolated domains */
@@ -7231,7 +5971,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
continue;
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, cpu_to_node(i));
+ GFP_KERNEL, cpu_to_node(cpu));
if (!sg)
goto fail;
@@ -7369,6 +6109,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
return;
update_group_power(sd, cpu);
+ atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+ return 0*SD_ASYM_PACKING;
}
/*
@@ -7923,54 +6669,52 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
}
#ifdef CONFIG_SCHED_MC
-static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
- struct sysdev_class_attribute *attr,
- char *page)
+static ssize_t sched_mc_power_savings_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
- return sprintf(page, "%u\n", sched_mc_power_savings);
+ return sprintf(buf, "%u\n", sched_mc_power_savings);
}
-static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
- struct sysdev_class_attribute *attr,
+static ssize_t sched_mc_power_savings_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
}
-static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
- sched_mc_power_savings_show,
- sched_mc_power_savings_store);
+static DEVICE_ATTR(sched_mc_power_savings, 0644,
+ sched_mc_power_savings_show,
+ sched_mc_power_savings_store);
#endif
#ifdef CONFIG_SCHED_SMT
-static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
- struct sysdev_class_attribute *attr,
- char *page)
+static ssize_t sched_smt_power_savings_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
{
- return sprintf(page, "%u\n", sched_smt_power_savings);
+ return sprintf(buf, "%u\n", sched_smt_power_savings);
}
-static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
- struct sysdev_class_attribute *attr,
+static ssize_t sched_smt_power_savings_store(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
}
-static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
+static DEVICE_ATTR(sched_smt_power_savings, 0644,
sched_smt_power_savings_show,
sched_smt_power_savings_store);
#endif
-int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+int __init sched_create_sysfs_power_savings_entries(struct device *dev)
{
int err = 0;
#ifdef CONFIG_SCHED_SMT
if (smt_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_smt_power_savings.attr);
+ err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
#endif
#ifdef CONFIG_SCHED_MC
if (!err && mc_capable())
- err = sysfs_create_file(&cls->kset.kobj,
- &attr_sched_mc_power_savings.attr);
+ err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
#endif
return err;
}
@@ -7984,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
+ switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
cpuset_update_active_cpus();
@@ -7997,33 +6741,10 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_DOWN_PREPARE:
- cpuset_update_active_cpus();
- return NOTIFY_OK;
- default:
- return NOTIFY_DONE;
- }
-}
-
-static int update_runtime(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- int cpu = (int)(long)hcpu;
-
switch (action) {
case CPU_DOWN_PREPARE:
- case CPU_DOWN_PREPARE_FROZEN:
- disable_runtime(cpu_rq(cpu));
- return NOTIFY_OK;
-
- case CPU_DOWN_FAILED:
- case CPU_DOWN_FAILED_FROZEN:
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- enable_runtime(cpu_rq(cpu));
+ cpuset_update_active_cpus();
return NOTIFY_OK;
-
default:
return NOTIFY_DONE;
}
@@ -8077,104 +6798,11 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
-static void init_cfs_rq(struct cfs_rq *cfs_rq)
-{
- cfs_rq->tasks_timeline = RB_ROOT;
- INIT_LIST_HEAD(&cfs_rq->tasks);
- cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-}
-
-static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
-{
- struct rt_prio_array *array;
- int i;
-
- array = &rt_rq->active;
- for (i = 0; i < MAX_RT_PRIO; i++) {
- INIT_LIST_HEAD(array->queue + i);
- __clear_bit(i, array->bitmap);
- }
- /* delimiter for bitsearch: */
- __set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->highest_prio.next = MAX_RT_PRIO;
- rt_rq->rt_nr_migratory = 0;
- rt_rq->overloaded = 0;
- plist_head_init(&rt_rq->pushable_tasks);
-#endif
-
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- rt_rq->rt_runtime = 0;
- raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
- struct sched_entity *se, int cpu,
- struct sched_entity *parent)
-{
- struct rq *rq = cpu_rq(cpu);
-
- cfs_rq->tg = tg;
- cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
- /* allow initial update_cfs_load() to truncate */
- cfs_rq->load_stamp = 1;
-#endif
- init_cfs_rq_runtime(cfs_rq);
-
- tg->cfs_rq[cpu] = cfs_rq;
- tg->se[cpu] = se;
-
- /* se could be NULL for root_task_group */
- if (!se)
- return;
-
- if (!parent)
- se->cfs_rq = &rq->cfs;
- else
- se->cfs_rq = parent->my_q;
-
- se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
- se->parent = parent;
-}
+#ifdef CONFIG_CGROUP_SCHED
+struct task_group root_task_group;
#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu,
- struct sched_rt_entity *parent)
-{
- struct rq *rq = cpu_rq(cpu);
-
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->rt_nr_boosted = 0;
- rt_rq->rq = rq;
- rt_rq->tg = tg;
-
- tg->rt_rq[cpu] = rt_rq;
- tg->rt_se[cpu] = rt_se;
-
- if (!rt_se)
- return;
-
- if (!parent)
- rt_se->rt_rq = &rq->rt;
- else
- rt_se->rt_rq = parent->my_q;
-
- rt_se->my_q = rt_rq;
- rt_se->parent = parent;
- INIT_LIST_HEAD(&rt_se->run_list);
-}
-#endif
+DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
void __init sched_init(void)
{
@@ -8232,9 +6860,17 @@ void __init sched_init(void)
#ifdef CONFIG_CGROUP_SCHED
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
autogroup_init(&init_task);
+
#endif /* CONFIG_CGROUP_SCHED */
+#ifdef CONFIG_CGROUP_CPUACCT
+ root_cpuacct.cpustat = &kernel_cpustat;
+ root_cpuacct.cpuusage = alloc_percpu(u64);
+ /* Too early, not expected to fail */
+ BUG_ON(!root_cpuacct.cpuusage);
+#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -8246,7 +6882,7 @@ void __init sched_init(void)
init_cfs_rq(&rq->cfs);
init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
- root_task_group.shares = root_task_group_load;
+ root_task_group.shares = ROOT_TASK_GROUP_LOAD;
INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
/*
* How much cpu bandwidth does root_task_group get?
@@ -8296,7 +6932,7 @@ void __init sched_init(void)
rq->avg_idle = 2*sysctl_sched_migration_cost;
rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
- rq->nohz_balance_kick = 0;
+ rq->nohz_flags = 0;
#endif
#endif
init_rq_hrtick(rq);
@@ -8309,10 +6945,6 @@ void __init sched_init(void)
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
-#ifdef CONFIG_SMP
- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#endif
-
#ifdef CONFIG_RT_MUTEXES
plist_head_init(&init_task.pi_waiters);
#endif
@@ -8340,17 +6972,11 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
-#ifdef CONFIG_NO_HZ
- zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
- alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
- atomic_set(&nohz.load_balancer, nr_cpu_ids);
- atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
- atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
-#endif
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
-#endif /* SMP */
+#endif
+ init_sched_fair_class();
scheduler_running = 1;
}
@@ -8400,10 +7026,10 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
on_rq = p->on_rq;
if (on_rq)
- deactivate_task(rq, p, 0);
+ dequeue_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
if (on_rq) {
- activate_task(rq, p, 0);
+ enqueue_task(rq, p, 0);
resched_task(rq->curr);
}
@@ -8502,169 +7128,10 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void free_fair_sched_group(struct task_group *tg)
-{
- int i;
-
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
- for_each_possible_cpu(i) {
- if (tg->cfs_rq)
- kfree(tg->cfs_rq[i]);
- if (tg->se)
- kfree(tg->se[i]);
- }
-
- kfree(tg->cfs_rq);
- kfree(tg->se);
-}
-
-static
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
- struct cfs_rq *cfs_rq;
- struct sched_entity *se;
- int i;
-
- tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
- if (!tg->cfs_rq)
- goto err;
- tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
- if (!tg->se)
- goto err;
-
- tg->shares = NICE_0_LOAD;
-
- init_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
- for_each_possible_cpu(i) {
- cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
- GFP_KERNEL, cpu_to_node(i));
- if (!cfs_rq)
- goto err;
-
- se = kzalloc_node(sizeof(struct sched_entity),
- GFP_KERNEL, cpu_to_node(i));
- if (!se)
- goto err_free_rq;
-
- init_cfs_rq(cfs_rq);
- init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
- }
-
- return 1;
-
-err_free_rq:
- kfree(cfs_rq);
-err:
- return 0;
-}
-
-static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
-
- /*
- * Only empty task groups can be destroyed; so we can speculatively
- * check on_list without danger of it being re-added.
- */
- if (!tg->cfs_rq[cpu]->on_list)
- return;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline void free_fair_sched_group(struct task_group *tg)
-{
-}
-
-static inline
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
- return 1;
-}
-
-static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void free_rt_sched_group(struct task_group *tg)
-{
- int i;
-
- if (tg->rt_se)
- destroy_rt_bandwidth(&tg->rt_bandwidth);
-
- for_each_possible_cpu(i) {
- if (tg->rt_rq)
- kfree(tg->rt_rq[i]);
- if (tg->rt_se)
- kfree(tg->rt_se[i]);
- }
-
- kfree(tg->rt_rq);
- kfree(tg->rt_se);
-}
-
-static
-int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
-{
- struct rt_rq *rt_rq;
- struct sched_rt_entity *rt_se;
- int i;
-
- tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
- if (!tg->rt_rq)
- goto err;
- tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
- if (!tg->rt_se)
- goto err;
-
- init_rt_bandwidth(&tg->rt_bandwidth,
- ktime_to_ns(def_rt_bandwidth.rt_period), 0);
-
- for_each_possible_cpu(i) {
- rt_rq = kzalloc_node(sizeof(struct rt_rq),
- GFP_KERNEL, cpu_to_node(i));
- if (!rt_rq)
- goto err;
-
- rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
- GFP_KERNEL, cpu_to_node(i));
- if (!rt_se)
- goto err_free_rq;
-
- init_rt_rq(rt_rq, cpu_rq(i));
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
- init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
- }
-
- return 1;
-
-err_free_rq:
- kfree(rt_rq);
-err:
- return 0;
-}
-#else /* !CONFIG_RT_GROUP_SCHED */
-static inline void free_rt_sched_group(struct task_group *tg)
-{
-}
-
-static inline
-int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
-{
- return 1;
-}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
#ifdef CONFIG_CGROUP_SCHED
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+
static void free_sched_group(struct task_group *tg)
{
free_fair_sched_group(tg);
@@ -8769,50 +7236,6 @@ void sched_move_task(struct task_struct *tsk)
}
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static DEFINE_MUTEX(shares_mutex);
-
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
-{
- int i;
- unsigned long flags;
-
- /*
- * We can't change the weight of the root cgroup.
- */
- if (!tg->se[0])
- return -EINVAL;
-
- shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
-
- mutex_lock(&shares_mutex);
- if (tg->shares == shares)
- goto done;
-
- tg->shares = shares;
- for_each_possible_cpu(i) {
- struct rq *rq = cpu_rq(i);
- struct sched_entity *se;
-
- se = tg->se[i];
- /* Propagate contribution to hierarchy */
- raw_spin_lock_irqsave(&rq->lock, flags);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
- raw_spin_unlock_irqrestore(&rq->lock, flags);
- }
-
-done:
- mutex_unlock(&shares_mutex);
- return 0;
-}
-
-unsigned long sched_group_shares(struct task_group *tg)
-{
- return tg->shares;
-}
-#endif
-
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
static unsigned long to_ratio(u64 period, u64 runtime)
{
@@ -8835,7 +7258,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
struct task_struct *g, *p;
do_each_thread(g, p) {
- if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+ if (rt_task(p) && task_rq(p)->rt.tg == tg)
return 1;
} while_each_thread(g, p);
@@ -9127,24 +7550,31 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
sched_destroy_group(tg);
}
-static int
-cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
- if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
- return -EINVAL;
+ if (!sched_rt_can_attach(cgroup_tg(cgrp), task))
+ return -EINVAL;
#else
- /* We don't support RT-tasks being in separate groups */
- if (tsk->sched_class != &fair_sched_class)
- return -EINVAL;
+ /* We don't support RT-tasks being in separate groups */
+ if (task->sched_class != &fair_sched_class)
+ return -EINVAL;
#endif
+ }
return 0;
}
-static void
-cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
- sched_move_task(tsk);
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset)
+ sched_move_task(task);
}
static void
@@ -9186,8 +7616,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
- int i, ret = 0, runtime_enabled;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ int i, ret = 0, runtime_enabled, runtime_was_enabled;
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
if (tg == &root_task_group)
return -EINVAL;
@@ -9214,6 +7644,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
goto out_unlock;
runtime_enabled = quota != RUNTIME_INF;
+ runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+ account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -9229,13 +7661,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
for_each_possible_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
- struct rq *rq = rq_of(cfs_rq);
+ struct rq *rq = cfs_rq->rq;
raw_spin_lock_irq(&rq->lock);
cfs_rq->runtime_enabled = runtime_enabled;
cfs_rq->runtime_remaining = 0;
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq->throttled)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
@@ -9249,7 +7681,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
u64 quota, period;
- period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+ period = ktime_to_ns(tg->cfs_bandwidth.period);
if (cfs_quota_us < 0)
quota = RUNTIME_INF;
else
@@ -9262,10 +7694,10 @@ long tg_get_cfs_quota(struct task_group *tg)
{
u64 quota_us;
- if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+ if (tg->cfs_bandwidth.quota == RUNTIME_INF)
return -1;
- quota_us = tg_cfs_bandwidth(tg)->quota;
+ quota_us = tg->cfs_bandwidth.quota;
do_div(quota_us, NSEC_PER_USEC);
return quota_us;
@@ -9276,10 +7708,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
u64 quota, period;
period = (u64)cfs_period_us * NSEC_PER_USEC;
- quota = tg_cfs_bandwidth(tg)->quota;
-
- if (period <= 0)
- return -EINVAL;
+ quota = tg->cfs_bandwidth.quota;
return tg_set_cfs_bandwidth(tg, period, quota);
}
@@ -9288,7 +7717,7 @@ long tg_get_cfs_period(struct task_group *tg)
{
u64 cfs_period_us;
- cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+ cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
do_div(cfs_period_us, NSEC_PER_USEC);
return cfs_period_us;
@@ -9348,13 +7777,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
struct cfs_schedulable_data *d = data;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
s64 quota = 0, parent_quota = -1;
if (!tg->parent) {
quota = RUNTIME_INF;
} else {
- struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+ struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
quota = normalize_cfs_quota(tg, d);
parent_quota = parent_b->hierarchal_quota;
@@ -9398,7 +7827,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
struct cgroup_map_cb *cb)
{
struct task_group *tg = cgroup_tg(cgrp);
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+ struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
cb->fill(cb, "nr_periods", cfs_b->nr_periods);
cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9480,8 +7909,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach_task = cpu_cgroup_can_attach_task,
- .attach_task = cpu_cgroup_attach_task,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
@@ -9499,38 +7928,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
* (balbir@in.ibm.com).
*/
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
- struct cpuacct *parent;
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_create(
struct cgroup_subsys *ss, struct cgroup *cgrp)
{
- struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- int i;
+ struct cpuacct *ca;
+
+ if (!cgrp->parent)
+ return &root_cpuacct.css;
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto out;
@@ -9538,18 +7945,13 @@ static struct cgroup_subsys_state *cpuacct_create(
if (!ca->cpuusage)
goto out_free_ca;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- if (percpu_counter_init(&ca->cpustat[i], 0))
- goto out_free_counters;
-
- if (cgrp->parent)
- ca->parent = cgroup_ca(cgrp->parent);
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
return &ca->css;
-out_free_counters:
- while (--i >= 0)
- percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
free_percpu(ca->cpuusage);
out_free_ca:
kfree(ca);
@@ -9562,10 +7964,8 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int i;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
- percpu_counter_destroy(&ca->cpustat[i]);
+ free_percpu(ca->cpustat);
free_percpu(ca->cpuusage);
kfree(ca);
}
@@ -9658,16 +8058,31 @@ static const char *cpuacct_stat_desc[] = {
};
static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
+ struct cgroup_map_cb *cb)
{
struct cpuacct *ca = cgroup_ca(cgrp);
- int i;
+ int cpu;
+ s64 val = 0;
- for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
- s64 val = percpu_counter_read(&ca->cpustat[i]);
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[i], val);
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
}
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
return 0;
}
@@ -9697,7 +8112,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
*
* called with rq->lock held.
*/
-static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
int cpu;
@@ -9711,7 +8126,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
ca = task_ca(tsk);
- for (; ca; ca = ca->parent) {
+ for (; ca; ca = parent_ca(ca)) {
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
@@ -9719,45 +8134,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
rcu_read_unlock();
}
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH \
- min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH 0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-static void cpuacct_update_stats(struct task_struct *tsk,
- enum cpuacct_stat_index idx, cputime_t val)
-{
- struct cpuacct *ca;
- int batch = CPUACCT_BATCH;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(tsk);
-
- do {
- __percpu_counter_add(&ca->cpustat[idx], val, batch);
- ca = ca->parent;
- } while (ca);
- rcu_read_unlock();
-}
-
struct cgroup_subsys cpuacct_subsys = {
.name = "cpuacct",
.create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb1..d72586fdf66 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
/*
- * kernel/sched_cpupri.c
+ * kernel/sched/cpupri.c
*
* CPU priority management
*
@@ -28,7 +28,7 @@
*/
#include <linux/gfp.h>
-#include "sched_cpupri.h"
+#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
static int convert_prio(int prio)
@@ -129,7 +129,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
* cpupri_set - update the cpu priority setting
* @cp: The cpupri context
* @cpu: The target cpu
- * @pri: The priority (INVALID-RT99) to assign to this CPU
+ * @newpri: The priority (INVALID-RT99) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
@@ -200,7 +200,6 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
/**
* cpupri_init - initialize the cpupri structure
* @cp: The cpupri context
- * @bootmem: true if allocations need to use bootmem
*
* Returns: -ENOMEM if memory fails.
*/
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d75617349..f6d75617349 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4..2a075e10004 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
/*
- * kernel/time/sched_debug.c
+ * kernel/sched/debug.c
*
* Print the CFS rbtree
*
@@ -16,6 +16,8 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
+#include "sched.h"
+
static DEFINE_SPINLOCK(sched_debug_lock);
/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
return 0;
}
-static void sysrq_sched_debug_show(void)
+void sysrq_sched_debug_show(void)
{
sched_debug_show(NULL, NULL);
}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 5c9e67923b7..aca16b843b7 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
+#include <linux/slab.h>
+#include <linux/profile.h>
+#include <linux/interrupt.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
-static const struct sched_class fair_sched_class;
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ *
+ * This idea comes from the SD scheduler of Con Kolivas:
+ */
+static int get_update_sysctl_factor(void)
+{
+ unsigned int cpus = min_t(int, num_online_cpus(), 8);
+ unsigned int factor;
+
+ switch (sysctl_sched_tunable_scaling) {
+ case SCHED_TUNABLESCALING_NONE:
+ factor = 1;
+ break;
+ case SCHED_TUNABLESCALING_LINEAR:
+ factor = cpus;
+ break;
+ case SCHED_TUNABLESCALING_LOG:
+ default:
+ factor = 1 + ilog2(cpus);
+ break;
+ }
+
+ return factor;
+}
+
+static void update_sysctl(void)
+{
+ unsigned int factor = get_update_sysctl_factor();
+
+#define SET_SYSCTL(name) \
+ (sysctl_##name = (factor) * normalized_sysctl_##name)
+ SET_SYSCTL(sched_min_granularity);
+ SET_SYSCTL(sched_latency);
+ SET_SYSCTL(sched_wakeup_granularity);
+#undef SET_SYSCTL
+}
+
+void sched_init_granularity(void)
+{
+ update_sysctl();
+}
+
+#if BITS_PER_LONG == 32
+# define WMULT_CONST (~0UL)
+#else
+# define WMULT_CONST (1UL << 32)
+#endif
+
+#define WMULT_SHIFT 32
+
+/*
+ * Shift right and round:
+ */
+#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
+
+/*
+ * delta *= weight / lw
+ */
+static unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+ struct load_weight *lw)
+{
+ u64 tmp;
+
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
+
+ if (!lw->inv_weight) {
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+ }
+
+ /*
+ * Check whether we'd overflow the 64-bit multiplication:
+ */
+ if (unlikely(tmp > WMULT_CONST))
+ tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
+ WMULT_SHIFT/2);
+ else
+ tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+}
+
+
+const struct sched_class fair_sched_class;
/**************************************************************
* CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
+struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
}
#ifdef CONFIG_SCHED_DEBUG
-static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
+struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
- inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+ update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, se->load.weight);
list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
if (!parent_entity(se))
- dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+ update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
if (entity_is_task(se)) {
add_cfs_task_weight(cfs_rq, -se->load.weight);
list_del_init(&se->group_node);
@@ -772,19 +882,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
+static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+{
+ long tg_weight;
+
+ /*
+ * Use this CPU's actual weight instead of the last load_contribution
+ * to gain a more accurate current total weight. See
+ * update_cfs_rq_load_contribution().
+ */
+ tg_weight = atomic_read(&tg->load_weight);
+ tg_weight -= cfs_rq->load_contribution;
+ tg_weight += cfs_rq->load.weight;
+
+ return tg_weight;
+}
+
static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long load_weight, load, shares;
+ long tg_weight, load, shares;
+ tg_weight = calc_tg_weight(tg, cfs_rq);
load = cfs_rq->load.weight;
- load_weight = atomic_read(&tg->load_weight);
- load_weight += load;
- load_weight -= cfs_rq->load_contribution;
-
shares = (tg->shares * load);
- if (load_weight)
- shares /= load_weight;
+ if (tg_weight)
+ shares /= tg_weight;
if (shares < MIN_SHARES)
shares = MIN_SHARES;
@@ -907,6 +1030,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
trace_sched_stat_iowait(tsk, delta);
}
+ trace_sched_stat_blocked(tsk, delta);
+
/*
* Blocking time is in units of nanosecs, so shift by
* 20 to get a milliseconds-range estimation of the
@@ -1274,6 +1399,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef HAVE_JUMP_LABEL
+static struct jump_label_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+ return static_branch(&__cfs_bandwidth_used);
+}
+
+void account_cfs_bandwidth_used(int enabled, int was_enabled)
+{
+ /* only need to count groups transitioning between enabled/!enabled */
+ if (enabled && !was_enabled)
+ jump_label_inc(&__cfs_bandwidth_used);
+ else if (!enabled && was_enabled)
+ jump_label_dec(&__cfs_bandwidth_used);
+}
+#else /* HAVE_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+ return true;
+}
+
+void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+#endif /* HAVE_JUMP_LABEL */
+
/*
* default period for cfs group bandwidth.
* default: 0.1s, units: nanoseconds
@@ -1295,7 +1446,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
*
* requires cfs_b->lock
*/
-static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
+void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
{
u64 now;
@@ -1307,6 +1458,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
}
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return &tg->cfs_bandwidth;
+}
+
/* returns 0 on failure to allocate runtime */
static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -1408,7 +1564,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec)
{
- if (!cfs_rq->runtime_enabled)
+ if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
__account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1416,13 +1572,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttled;
+ return cfs_bandwidth_used() && cfs_rq->throttled;
}
/* check whether cfs_rq, or any parent, is throttled */
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
{
- return cfs_rq->throttle_count;
+ return cfs_bandwidth_used() && cfs_rq->throttle_count;
}
/*
@@ -1517,7 +1673,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
raw_spin_unlock(&cfs_b->lock);
}
-static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
+void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1743,7 +1899,10 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
- if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -1788,6 +1947,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
*/
static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -1805,6 +1967,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
+ if (!cfs_bandwidth_used())
+ return;
+
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
return;
@@ -1817,7 +1982,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
-#else
+
+static inline u64 default_cfs_period(void);
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
+static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
+
+static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, slack_timer);
+ do_sched_cfs_slack_timer(cfs_b);
+
+ return HRTIMER_NORESTART;
+}
+
+static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
+{
+ struct cfs_bandwidth *cfs_b =
+ container_of(timer, struct cfs_bandwidth, period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, cfs_b->period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_cfs_period_timer(cfs_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ raw_spin_lock_init(&cfs_b->lock);
+ cfs_b->runtime = 0;
+ cfs_b->quota = RUNTIME_INF;
+ cfs_b->period = ns_to_ktime(default_cfs_period());
+
+ INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
+ hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->period_timer.function = sched_cfs_period_timer;
+ hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cfs_b->slack_timer.function = sched_cfs_slack_timer;
+}
+
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->runtime_enabled = 0;
+ INIT_LIST_HEAD(&cfs_rq->throttled_list);
+}
+
+/* requires cfs_b->lock, may release to reprogram timer */
+void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ /*
+ * The timer may be active because we're trying to set a new bandwidth
+ * period or because we're racing with the tear-down path
+ * (timer_active==0 becomes visible before the hrtimer call-back
+ * terminates). In either case we ensure that it's re-programmed
+ */
+ while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ raw_spin_unlock(&cfs_b->lock);
+ /* ensure cfs_b->lock is available while we wait */
+ hrtimer_cancel(&cfs_b->period_timer);
+
+ raw_spin_lock(&cfs_b->lock);
+ /* if someone else restarted the timer then we're done */
+ if (cfs_b->timer_active)
+ return;
+ }
+
+ cfs_b->timer_active = 1;
+ start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+}
+
+static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
+{
+ hrtimer_cancel(&cfs_b->period_timer);
+ hrtimer_cancel(&cfs_b->slack_timer);
+}
+
+void unthrottle_offline_cfs_rqs(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+
+ if (!cfs_rq->runtime_enabled)
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = cfs_b->quota;
+ if (cfs_rq_throttled(cfs_rq))
+ unthrottle_cfs_rq(cfs_rq);
+ }
+}
+
+#else /* CONFIG_CFS_BANDWIDTH */
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1839,8 +2109,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
{
return 0;
}
+
+void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
#endif
+static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
+{
+ return NULL;
+}
+static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+void unthrottle_offline_cfs_rqs(struct rq *rq) {}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
/**************************************************
* CFS operations on tasks:
*/
@@ -1853,7 +2137,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
WARN_ON(task_rq(p) != rq);
- if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+ if (cfs_rq->nr_running > 1) {
u64 slice = sched_slice(cfs_rq, se);
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
s64 delta = slice - ran;
@@ -1884,7 +2168,7 @@ static void hrtick_update(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- if (curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
return;
if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2007,6 +2291,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/* Used instead of source_load when we know the type == 0 */
+static unsigned long weighted_cpuload(const int cpu)
+{
+ return cpu_rq(cpu)->load.weight;
+}
+
+/*
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
+ *
+ * We want to under-estimate the load of migration sources, to
+ * balance conservatively.
+ */
+static unsigned long source_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return min(rq->cpu_load[type-1], total);
+}
+
+/*
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
+ */
+static unsigned long target_load(int cpu, int type)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long total = weighted_cpuload(cpu);
+
+ if (type == 0 || !sched_feat(LB_BIAS))
+ return total;
+
+ return max(rq->cpu_load[type-1], total);
+}
+
+static unsigned long power_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_power;
+}
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
+
+ if (nr_running)
+ return rq->load.weight / nr_running;
+
+ return 0;
+}
+
static void task_waking_fair(struct task_struct *p)
{
@@ -2036,36 +2375,100 @@ static void task_waking_fair(struct task_struct *p)
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
+ *
+ * Calculate the effective load difference if @wl is added (subtracted) to @tg
+ * on this @cpu and results in a total addition (subtraction) of @wg to the
+ * total group weight.
+ *
+ * Given a runqueue weight distribution (rw_i) we can compute a shares
+ * distribution (s_i) using:
+ *
+ * s_i = rw_i / \Sum rw_j (1)
+ *
+ * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
+ * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
+ * shares distribution (s_i):
+ *
+ * rw_i = { 2, 4, 1, 0 }
+ * s_i = { 2/7, 4/7, 1/7, 0 }
+ *
+ * As per wake_affine() we're interested in the load of two CPUs (the CPU the
+ * task used to run on and the CPU the waker is running on), we need to
+ * compute the effect of waking a task on either CPU and, in case of a sync
+ * wakeup, compute the effect of the current task going to sleep.
+ *
+ * So for a change of @wl to the local @cpu with an overall group weight change
+ * of @wl we can compute the new shares distribution (s'_i) using:
+ *
+ * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
+ *
+ * Suppose we're interested in CPUs 0 and 1, and want to compute the load
+ * differences in waking a task to CPU 0. The additional task changes the
+ * weight and shares distributions like:
+ *
+ * rw'_i = { 3, 4, 1, 0 }
+ * s'_i = { 3/8, 4/8, 1/8, 0 }
+ *
+ * We can then compute the difference in effective weight by using:
+ *
+ * dw_i = S * (s'_i - s_i) (3)
+ *
+ * Where 'S' is the group weight as seen by its parent.
+ *
+ * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
+ * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
+ * 4/7) times the weight of the group.
*/
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent)
+ if (!tg->parent) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
- long lw, w;
+ long w, W;
tg = se->my_q->tg;
- w = se->my_q->load.weight;
- /* use this cpu's instantaneous contribution */
- lw = atomic_read(&tg->load_weight);
- lw -= se->my_q->load_contribution;
- lw += w + wg;
+ /*
+ * W = @wg + \Sum rw_j
+ */
+ W = wg + calc_tg_weight(tg, se->my_q);
- wl += w;
+ /*
+ * w = rw_i + @wl
+ */
+ w = se->my_q->load.weight + wl;
- if (lw > 0 && wl < lw)
- wl = (wl * tg->shares) / lw;
+ /*
+ * wl = S * s'_i; see (2)
+ */
+ if (W > 0 && w < W)
+ wl = (w * tg->shares) / W;
else
wl = tg->shares;
- /* zero point is MIN_SHARES */
+ /*
+ * Per the above, wl is the new se->load.weight value; since
+ * those are clipped to [MIN_SHARES, ...) do so now. See
+ * calc_cfs_shares().
+ */
if (wl < MIN_SHARES)
wl = MIN_SHARES;
+
+ /*
+ * wl = dw_i = S * (s'_i - s_i); see (3)
+ */
wl -= se->load.weight;
+
+ /*
+ * Recursively apply this logic to all parent groups to compute
+ * the final effective load change on the root group. Since
+ * only the @tg group gets extra weight, all parent groups can
+ * only redistribute existing shares. @wl is the shift in shares
+ * resulting from this level per the above.
+ */
wg = 0;
}
@@ -2249,6 +2652,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
+ struct sched_group *sg;
int i;
/*
@@ -2269,25 +2673,28 @@ static int select_idle_sibling(struct task_struct *p, int target)
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
rcu_read_lock();
- for_each_domain(target, sd) {
- if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
- break;
- for_each_cpu_and(i, sched_domain_span(sd), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
- target = i;
- break;
+ sd = rcu_dereference(per_cpu(sd_llc, target));
+ for_each_lower_domain(sd) {
+ sg = sd->groups;
+ do {
+ if (!cpumask_intersects(sched_group_cpus(sg),
+ tsk_cpus_allowed(p)))
+ goto next;
+
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (!idle_cpu(i))
+ goto next;
}
- }
- /*
- * Lets stop looking for an idle sibling when we reached
- * the domain that spans the current cpu and prev_cpu.
- */
- if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
- cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
- break;
+ target = cpumask_first_and(sched_group_cpus(sg),
+ tsk_cpus_allowed(p));
+ goto done;
+next:
+ sg = sg->next;
+ } while (sg != sd->groups);
}
+done:
rcu_read_unlock();
return target;
@@ -2315,6 +2722,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
int want_sd = 1;
int sync = wake_flags & WF_SYNC;
+ if (p->rt.nr_cpus_allowed == 1)
+ return prev_cpu;
+
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
@@ -2599,7 +3009,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
} while (cfs_rq);
p = task_of(se);
- hrtick_start_fair(rq, p);
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
return p;
}
@@ -2643,6 +3054,12 @@ static void yield_task_fair(struct rq *rq)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+ /*
+ * Tell update_rq_clock() that we've just updated,
+ * so we don't do microscopic update in schedule()
+ * and double the fastpath cost.
+ */
+ rq->skip_clock_update = 1;
}
set_skip_buddy(se);
@@ -2683,12 +3100,50 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
}
/*
+ * Is this task likely cache-hot:
+ */
+static int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+ s64 delta;
+
+ if (p->sched_class != &fair_sched_class)
+ return 0;
+
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
+ /*
+ * Buddy candidates are cache hot:
+ */
+ if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+ (&p->se == cfs_rq_of(&p->se)->next ||
+ &p->se == cfs_rq_of(&p->se)->last))
+ return 1;
+
+ if (sysctl_sched_migration_cost == -1)
+ return 1;
+ if (sysctl_sched_migration_cost == 0)
+ return 0;
+
+ delta = now - p->se.exec_start;
+
+ return delta < (s64)sysctl_sched_migration_cost;
+}
+
+#define LBF_ALL_PINNED 0x01
+#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */
+#define LBF_HAD_BREAK 0x04
+#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */
+#define LBF_ABORT 0x10
+
+/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
int tsk_cache_hot = 0;
/*
@@ -2701,7 +3156,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
return 0;
}
- *all_pinned = 0;
+ *lb_flags &= ~LBF_ALL_PINNED;
if (task_running(rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2775,7 +3230,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
- enum cpu_idle_type idle, int *all_pinned,
+ enum cpu_idle_type idle, int *lb_flags,
struct cfs_rq *busiest_cfs_rq)
{
int loops = 0, pulled = 0;
@@ -2786,12 +3241,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
goto out;
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
- if (loops++ > sysctl_sched_nr_migrate)
+ if (loops++ > sysctl_sched_nr_migrate) {
+ *lb_flags |= LBF_NEED_BREAK;
break;
+ }
if ((p->se.load.weight >> 1) > rem_load_move ||
!can_migrate_task(p, busiest, this_cpu, sd, idle,
- all_pinned))
+ lb_flags))
continue;
pull_task(busiest, p, this_rq, this_cpu);
@@ -2804,8 +3261,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
- if (idle == CPU_NEWLY_IDLE)
+ if (idle == CPU_NEWLY_IDLE) {
+ *lb_flags |= LBF_ABORT;
break;
+ }
#endif
/*
@@ -2910,7 +3369,7 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
long rem_load_move = max_load_move;
struct cfs_rq *busiest_cfs_rq;
@@ -2923,6 +3382,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long busiest_weight = busiest_cfs_rq->load.weight;
u64 rem_load, moved_load;
+ if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+ break;
+
/*
* empty group or part of a throttled hierarchy
*/
@@ -2934,7 +3396,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = balance_tasks(this_rq, this_cpu, busiest,
- rem_load, sd, idle, all_pinned,
+ rem_load, sd, idle, lb_flags,
busiest_cfs_rq);
if (!moved_load)
@@ -2960,10 +3422,10 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
return balance_tasks(this_rq, this_cpu, busiest,
- max_load_move, sd, idle, all_pinned,
+ max_load_move, sd, idle, lb_flags,
&busiest->cfs);
}
#endif
@@ -2978,29 +3440,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned)
+ int *lb_flags)
{
unsigned long total_load_moved = 0, load_moved;
do {
load_moved = load_balance_fair(this_rq, this_cpu, busiest,
max_load_move - total_load_moved,
- sd, idle, all_pinned);
+ sd, idle, lb_flags);
total_load_moved += load_moved;
+ if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
+ break;
+
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
- if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
- break;
-
- if (raw_spin_is_contended(&this_rq->lock) ||
- raw_spin_is_contended(&busiest->lock))
+ if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
+ *lb_flags |= LBF_ABORT;
break;
+ }
#endif
} while (load_moved && max_load_move > total_load_moved);
@@ -3062,15 +3525,6 @@ struct sg_lb_stats {
};
/**
- * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
- * @group: The group whose first cpu is to be returned.
- */
-static inline unsigned int group_first_cpu(struct sched_group *group)
-{
- return cpumask_first(sched_group_cpus(group));
-}
-
-/**
* get_sd_load_idx - Obtain the load index for a given sched domain.
* @sd: The sched_domain whose load_idx is to be obtained.
* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
@@ -3319,7 +3773,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
sdg->sgp->power = power;
}
-static void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_power(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
@@ -3511,7 +3965,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
}
/**
- * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
@@ -3585,11 +4039,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
} while (sg != sd->groups);
}
-int __weak arch_sd_sibling_asym_packing(void)
-{
- return 0*SD_ASYM_PACKING;
-}
-
/**
* check_asym_packing - Check to see if the group is packed into the
* sched doman.
@@ -3953,7 +4402,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
-static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
static int need_active_balance(struct sched_domain *sd, int idle,
int busiest_cpu, int this_cpu)
@@ -4004,7 +4453,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, all_pinned = 0, active_balance = 0;
+ int ld_moved, lb_flags = 0, active_balance = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
@@ -4045,11 +4494,11 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
- all_pinned = 1;
+ lb_flags |= LBF_ALL_PINNED;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle, &all_pinned);
+ imbalance, sd, idle, &lb_flags);
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
@@ -4059,8 +4508,18 @@ redo:
if (ld_moved && this_cpu != smp_processor_id())
resched_cpu(this_cpu);
+ if (lb_flags & LBF_ABORT)
+ goto out_balanced;
+
+ if (lb_flags & LBF_NEED_BREAK) {
+ lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK;
+ if (lb_flags & LBF_ABORT)
+ goto out_balanced;
+ goto redo;
+ }
+
/* All tasks on this runqueue were pinned by CPU affinity */
- if (unlikely(all_pinned)) {
+ if (unlikely(lb_flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus))
goto redo;
@@ -4090,7 +4549,7 @@ redo:
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
- all_pinned = 1;
+ lb_flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
@@ -4143,7 +4602,8 @@ out_balanced:
out_one_pinned:
/* tune up the balancing interval */
- if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ if (((lb_flags & LBF_ALL_PINNED) &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
@@ -4156,7 +4616,7 @@ out:
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-static void idle_balance(int this_cpu, struct rq *this_rq)
+void idle_balance(int this_cpu, struct rq *this_rq)
{
struct sched_domain *sd;
int pulled_task = 0;
@@ -4271,28 +4731,16 @@ out_unlock:
#ifdef CONFIG_NO_HZ
/*
* idle load balancing details
- * - One of the idle CPUs nominates itself as idle load_balancer, while
- * entering idle.
- * - This idle load balancer CPU will also go into tickless mode when
- * it is idle, just like all other idle CPUs
* - When one of the busy CPUs notice that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
*/
static struct {
- atomic_t load_balancer;
- atomic_t first_pick_cpu;
- atomic_t second_pick_cpu;
cpumask_var_t idle_cpus_mask;
- cpumask_var_t grp_idle_mask;
+ atomic_t nr_cpus;
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-int get_nohz_load_balancer(void)
-{
- return atomic_read(&nohz.load_balancer);
-}
-
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
* lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4329,33 +4777,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
(sd && (sd->flags & flag)); sd = sd->parent)
/**
- * is_semi_idle_group - Checks if the given sched_group is semi-idle.
- * @ilb_group: group to be checked for semi-idleness
- *
- * Returns: 1 if the group is semi-idle. 0 otherwise.
- *
- * We define a sched_group to be semi idle if it has atleast one idle-CPU
- * and atleast one non-idle CPU. This helper function checks if the given
- * sched_group is semi-idle or not.
- */
-static inline int is_semi_idle_group(struct sched_group *ilb_group)
-{
- cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
- sched_group_cpus(ilb_group));
-
- /*
- * A sched_group is semi-idle when it has atleast one busy cpu
- * and atleast one idle cpu.
- */
- if (cpumask_empty(nohz.grp_idle_mask))
- return 0;
-
- if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
- return 0;
-
- return 1;
-}
-/**
* find_new_ilb - Finds the optimum idle load balancer for nomination.
* @cpu: The cpu which is nominating a new idle_load_balancer.
*
@@ -4369,9 +4790,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
*/
static int find_new_ilb(int cpu)
{
+ int ilb = cpumask_first(nohz.idle_cpus_mask);
+ struct sched_group *ilbg;
struct sched_domain *sd;
- struct sched_group *ilb_group;
- int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -4389,23 +4810,28 @@ static int find_new_ilb(int cpu)
rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
- ilb_group = sd->groups;
+ ilbg = sd->groups;
do {
- if (is_semi_idle_group(ilb_group)) {
- ilb = cpumask_first(nohz.grp_idle_mask);
+ if (ilbg->group_weight !=
+ atomic_read(&ilbg->sgp->nr_busy_cpus)) {
+ ilb = cpumask_first_and(nohz.idle_cpus_mask,
+ sched_group_cpus(ilbg));
goto unlock;
}
- ilb_group = ilb_group->next;
+ ilbg = ilbg->next;
- } while (ilb_group != sd->groups);
+ } while (ilbg != sd->groups);
}
unlock:
rcu_read_unlock();
out_done:
- return ilb;
+ if (ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
+
+ return nr_cpu_ids;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
@@ -4425,102 +4851,98 @@ static void nohz_balancer_kick(int cpu)
nohz.next_balance++;
- ilb_cpu = get_nohz_load_balancer();
+ ilb_cpu = find_new_ilb(cpu);
- if (ilb_cpu >= nr_cpu_ids) {
- ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
- if (ilb_cpu >= nr_cpu_ids)
- return;
- }
+ if (ilb_cpu >= nr_cpu_ids)
+ return;
- if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
- cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
+ if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
+ return;
+ /*
+ * Use smp_send_reschedule() instead of resched_cpu().
+ * This way we generate a sched IPI on the target cpu which
+ * is idle. And the softirq performing nohz idle load balance
+ * will be run before returning from the IPI.
+ */
+ smp_send_reschedule(ilb_cpu);
+ return;
+}
- smp_mb();
- /*
- * Use smp_send_reschedule() instead of resched_cpu().
- * This way we generate a sched IPI on the target cpu which
- * is idle. And the softirq performing nohz idle load balance
- * will be run before returning from the IPI.
- */
- smp_send_reschedule(ilb_cpu);
+static inline void clear_nohz_tick_stopped(int cpu)
+{
+ if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
- return;
}
-/*
- * This routine will try to nominate the ilb (idle load balancing)
- * owner among the cpus whose ticks are stopped. ilb owner will do the idle
- * load balancing on behalf of all those cpus.
- *
- * When the ilb owner becomes busy, we will not have new ilb owner until some
- * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
- * idle load balancing by kicking one of the idle CPUs.
- *
- * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
- * ilb owner CPU in future (when there is a need for idle load balancing on
- * behalf of all idle CPUs).
- */
-void select_nohz_load_balancer(int stop_tick)
+static inline void set_cpu_sd_state_busy(void)
{
+ struct sched_domain *sd;
int cpu = smp_processor_id();
- if (stop_tick) {
- if (!cpu_active(cpu)) {
- if (atomic_read(&nohz.load_balancer) != cpu)
- return;
+ if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
+ return;
+ clear_bit(NOHZ_IDLE, nohz_flags(cpu));
- /*
- * If we are going offline and still the leader,
- * give up!
- */
- if (atomic_cmpxchg(&nohz.load_balancer, cpu,
- nr_cpu_ids) != cpu)
- BUG();
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ rcu_read_unlock();
+}
- return;
- }
+void set_cpu_sd_state_idle(void)
+{
+ struct sched_domain *sd;
+ int cpu = smp_processor_id();
- cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
+ return;
+ set_bit(NOHZ_IDLE, nohz_flags(cpu));
- if (atomic_read(&nohz.first_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
- if (atomic_read(&nohz.second_pick_cpu) == cpu)
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
+ rcu_read_lock();
+ for_each_domain(cpu, sd)
+ atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ rcu_read_unlock();
+}
- if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
- int new_ilb;
+/*
+ * This routine will record that this cpu is going idle with tick stopped.
+ * This info will be used in performing idle load balancing in the future.
+ */
+void select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
- /* make me the ilb owner */
- if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
- cpu) != nr_cpu_ids)
- return;
+ /*
+ * If this cpu is going down, then nothing needs to be done.
+ */
+ if (!cpu_active(cpu))
+ return;
- /*
- * Check to see if there is a more power-efficient
- * ilb.
- */
- new_ilb = find_new_ilb(cpu);
- if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
- atomic_set(&nohz.load_balancer, nr_cpu_ids);
- resched_cpu(new_ilb);
- return;
- }
- return;
- }
- } else {
- if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
+ if (stop_tick) {
+ if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
-
- if (atomic_read(&nohz.load_balancer) == cpu)
- if (atomic_cmpxchg(&nohz.load_balancer, cpu,
- nr_cpu_ids) != cpu)
- BUG();
+ cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_inc(&nohz.nr_cpus);
+ set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
return;
}
+
+static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DYING:
+ clear_nohz_tick_stopped(smp_processor_id());
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
#endif
static DEFINE_SPINLOCK(balancing);
@@ -4531,7 +4953,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
* Scale the max load_balance interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
*/
-static void update_max_interval(void)
+void update_max_interval(void)
{
max_load_balance_interval = HZ*num_online_cpus()/10;
}
@@ -4623,11 +5045,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
struct rq *rq;
int balance_cpu;
- if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
- return;
+ if (idle != CPU_IDLE ||
+ !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
+ goto end;
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
- if (balance_cpu == this_cpu)
+ if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
/*
@@ -4635,10 +5058,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
- if (need_resched()) {
- this_rq->nohz_balance_kick = 0;
+ if (need_resched())
break;
- }
raw_spin_lock_irq(&this_rq->lock);
update_rq_clock(this_rq);
@@ -4652,53 +5073,71 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
this_rq->next_balance = rq->next_balance;
}
nohz.next_balance = this_rq->next_balance;
- this_rq->nohz_balance_kick = 0;
+end:
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
/*
- * Current heuristic for kicking the idle load balancer
- * - first_pick_cpu is the one of the busy CPUs. It will kick
- * idle load balancer when it has more than one process active. This
- * eliminates the need for idle load balancing altogether when we have
- * only one running process in the system (common case).
- * - If there are more than one busy CPU, idle load balancer may have
- * to run for active_load_balance to happen (i.e., two busy CPUs are
- * SMT or core siblings and can run better if they move to different
- * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
- * which will kick idle load balancer as soon as it has any load.
+ * Current heuristic for kicking the idle load balancer in the presence
+ * of an idle cpu is the system.
+ * - This rq has more than one task.
+ * - At any scheduler domain level, this cpu's scheduler group has multiple
+ * busy cpu's exceeding the group's power.
+ * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
+ * domain span are idle.
*/
static inline int nohz_kick_needed(struct rq *rq, int cpu)
{
unsigned long now = jiffies;
- int ret;
- int first_pick_cpu, second_pick_cpu;
+ struct sched_domain *sd;
- if (time_before(now, nohz.next_balance))
+ if (unlikely(idle_cpu(cpu)))
return 0;
- if (idle_cpu(cpu))
- return 0;
+ /*
+ * We may be recently in ticked or tickless idle mode. At the first
+ * busy tick after returning from idle, we will update the busy stats.
+ */
+ set_cpu_sd_state_busy();
+ clear_nohz_tick_stopped(cpu);
- first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
- second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
+ /*
+ * None are in tickless mode and hence no need for NOHZ idle load
+ * balancing.
+ */
+ if (likely(!atomic_read(&nohz.nr_cpus)))
+ return 0;
- if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
- second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
+ if (time_before(now, nohz.next_balance))
return 0;
- ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
- if (ret == nr_cpu_ids || ret == cpu) {
- atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
- if (rq->nr_running > 1)
- return 1;
- } else {
- ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
- if (ret == nr_cpu_ids || ret == cpu) {
- if (rq->nr_running)
- return 1;
- }
+ if (rq->nr_running >= 2)
+ goto need_kick;
+
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ struct sched_group *sg = sd->groups;
+ struct sched_group_power *sgp = sg->sgp;
+ int nr_busy = atomic_read(&sgp->nr_busy_cpus);
+
+ if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
+ goto need_kick_unlock;
+
+ if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
+ && (cpumask_first_and(nohz.idle_cpus_mask,
+ sched_domain_span(sd)) < cpu))
+ goto need_kick_unlock;
+
+ if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
+ break;
}
+ rcu_read_unlock();
return 0;
+
+need_kick_unlock:
+ rcu_read_unlock();
+need_kick:
+ return 1;
}
#else
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4733,14 +5172,14 @@ static inline int on_null_domain(int cpu)
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-static inline void trigger_load_balance(struct rq *rq, int cpu)
+void trigger_load_balance(struct rq *rq, int cpu)
{
/* Don't need to rebalance while attached to NULL domain */
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ
- else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
+ if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
}
@@ -4755,15 +5194,6 @@ static void rq_offline_fair(struct rq *rq)
update_sysctl();
}
-#else /* CONFIG_SMP */
-
-/*
- * on UP we do not need to balance between CPUs:
- */
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
-
#endif /* CONFIG_SMP */
/*
@@ -4787,8 +5217,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
*/
static void task_fork_fair(struct task_struct *p)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(current);
- struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se = &p->se, *curr;
int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
unsigned long flags;
@@ -4797,6 +5227,9 @@ static void task_fork_fair(struct task_struct *p)
update_rq_clock(rq);
+ cfs_rq = task_cfs_rq(current);
+ curr = cfs_rq->curr;
+
if (unlikely(task_cpu(p) != this_cpu)) {
rcu_read_lock();
__set_task_cpu(p, this_cpu);
@@ -4906,6 +5339,16 @@ static void set_curr_task_fair(struct rq *rq)
}
}
+void init_cfs_rq(struct cfs_rq *cfs_rq)
+{
+ cfs_rq->tasks_timeline = RB_ROOT;
+ INIT_LIST_HEAD(&cfs_rq->tasks);
+ cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
@@ -4922,13 +5365,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* to another cgroup's rq. This does somewhat interfere with the
* fair sleeper stuff for the first placement, but who cares.
*/
+ /*
+ * When !on_rq, vruntime of the task has usually NOT been normalized.
+ * But there are some cases where it has already been normalized:
+ *
+ * - Moving a forked child which is waiting for being woken up by
+ * wake_up_new_task().
+ * - Moving a task which has been woken up by try_to_wake_up() and
+ * waiting for actually being woken up by sched_ttwu_pending().
+ *
+ * To prevent boost or penalty in the new cfs_rq caused by delta
+ * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
+ */
+ if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ on_rq = 1;
+
if (!on_rq)
p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
set_task_rq(p, task_cpu(p));
if (!on_rq)
p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
+
+void free_fair_sched_group(struct task_group *tg)
+{
+ int i;
+
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ if (tg->cfs_rq)
+ kfree(tg->cfs_rq[i]);
+ if (tg->se)
+ kfree(tg->se[i]);
+ }
+
+ kfree(tg->cfs_rq);
+ kfree(tg->se);
+}
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct cfs_rq *cfs_rq;
+ struct sched_entity *se;
+ int i;
+
+ tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->cfs_rq)
+ goto err;
+ tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->se)
+ goto err;
+
+ tg->shares = NICE_0_LOAD;
+
+ init_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
+ for_each_possible_cpu(i) {
+ cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!cfs_rq)
+ goto err;
+
+ se = kzalloc_node(sizeof(struct sched_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!se)
+ goto err_free_rq;
+
+ init_cfs_rq(cfs_rq);
+ init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(cfs_rq);
+err:
+ return 0;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ /*
+ * Only empty task groups can be destroyed; so we can speculatively
+ * check on_list without danger of it being re-added.
+ */
+ if (!tg->cfs_rq[cpu]->on_list)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ cfs_rq->tg = tg;
+ cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+ /* allow initial update_cfs_load() to truncate */
+ cfs_rq->load_stamp = 1;
#endif
+ init_cfs_rq_runtime(cfs_rq);
+
+ tg->cfs_rq[cpu] = cfs_rq;
+ tg->se[cpu] = se;
+
+ /* se could be NULL for root_task_group */
+ if (!se)
+ return;
+
+ if (!parent)
+ se->cfs_rq = &rq->cfs;
+ else
+ se->cfs_rq = parent->my_q;
+
+ se->my_q = cfs_rq;
+ update_load_set(&se->load, 0);
+ se->parent = parent;
+}
+
+static DEFINE_MUTEX(shares_mutex);
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int i;
+ unsigned long flags;
+
+ /*
+ * We can't change the weight of the root cgroup.
+ */
+ if (!tg->se[0])
+ return -EINVAL;
+
+ shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+
+ mutex_lock(&shares_mutex);
+ if (tg->shares == shares)
+ goto done;
+
+ tg->shares = shares;
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se;
+
+ se = tg->se[i];
+ /* Propagate contribution to hierarchy */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ for_each_sched_entity(se)
+ update_cfs_shares(group_cfs_rq(se));
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+done:
+ mutex_unlock(&shares_mutex);
+ return 0;
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
+
+void free_fair_sched_group(struct task_group *tg) { }
+
+int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
+
+void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
+
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
{
@@ -4948,7 +5560,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-static const struct sched_class fair_sched_class = {
+const struct sched_class fair_sched_class = {
.next = &idle_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
@@ -4985,7 +5597,7 @@ static const struct sched_class fair_sched_class = {
};
#ifdef CONFIG_SCHED_DEBUG
-static void print_cfs_stats(struct seq_file *m, int cpu)
+void print_cfs_stats(struct seq_file *m, int cpu)
{
struct cfs_rq *cfs_rq;
@@ -4995,3 +5607,16 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
rcu_read_unlock();
}
#endif
+
+__init void init_sched_fair_class(void)
+{
+#ifdef CONFIG_SMP
+ open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
+
+#ifdef CONFIG_NO_HZ
+ zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+ cpu_notifier(sched_ilb_notifier, 0);
+#endif
+#endif /* SMP */
+
+}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index efa0a7b75dd..e61fd73913d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
* them to run sooner, but does not allow tons of sleepers to
* rip the spread apart.
*/
-SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
/*
* Place new tasks ahead so that they do not starve already running
* tasks
*/
-SCHED_FEAT(START_DEBIT, 1)
+SCHED_FEAT(START_DEBIT, true)
/*
* Based on load and program behaviour, see if it makes sense to place
@@ -17,53 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
* improve cache locality. Typically used with SYNC wakeups as
* generated by pipes and the like, see also SYNC_WAKEUPS.
*/
-SCHED_FEAT(AFFINE_WAKEUPS, 1)
+SCHED_FEAT(AFFINE_WAKEUPS, true)
/*
* Prefer to schedule the task we woke last (assuming it failed
* wakeup-preemption), since its likely going to consume data we
* touched, increases cache locality.
*/
-SCHED_FEAT(NEXT_BUDDY, 0)
+SCHED_FEAT(NEXT_BUDDY, false)
/*
* Prefer to schedule the task that ran last (when we did
* wake-preempt) as that likely will touch the same data, increases
* cache locality.
*/
-SCHED_FEAT(LAST_BUDDY, 1)
+SCHED_FEAT(LAST_BUDDY, true)
/*
* Consider buddies to be cache hot, decreases the likelyness of a
* cache buddy being migrated away, increases cache locality.
*/
-SCHED_FEAT(CACHE_HOT_BUDDY, 1)
+SCHED_FEAT(CACHE_HOT_BUDDY, true)
/*
* Use arch dependent cpu power functions
*/
-SCHED_FEAT(ARCH_POWER, 0)
+SCHED_FEAT(ARCH_POWER, false)
-SCHED_FEAT(HRTICK, 0)
-SCHED_FEAT(DOUBLE_TICK, 0)
-SCHED_FEAT(LB_BIAS, 1)
+SCHED_FEAT(HRTICK, false)
+SCHED_FEAT(DOUBLE_TICK, false)
+SCHED_FEAT(LB_BIAS, true)
/*
* Spin-wait on mutex acquisition when the mutex owner is running on
* another cpu -- assumes that when the owner is running, it will soon
* release the lock. Decreases scheduling overhead.
*/
-SCHED_FEAT(OWNER_SPIN, 1)
+SCHED_FEAT(OWNER_SPIN, true)
/*
* Decrement CPU power based on time not spent running tasks
*/
-SCHED_FEAT(NONTASK_POWER, 1)
+SCHED_FEAT(NONTASK_POWER, true)
/*
* Queue remote wakeups on the target CPU and process them
* using the scheduler IPI. Reduces rq->lock contention/bounces.
*/
-SCHED_FEAT(TTWU_QUEUE, 1)
+SCHED_FEAT(TTWU_QUEUE, true)
-SCHED_FEAT(FORCE_SD_OVERLAP, 0)
+SCHED_FEAT(FORCE_SD_OVERLAP, false)
+SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534e..91b4c957f28 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
+#include "sched.h"
+
/*
* idle-task scheduling class.
*
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-static const struct sched_class idle_sched_class = {
+const struct sched_class idle_sched_class = {
/* .next is NULL */
/* no enqueue/yield_task for idle tasks */
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 056cbd2e2a2..f42ae7fb5ec 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
* policies)
*/
+#include "sched.h"
+
+#include <linux/slab.h>
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+struct rt_bandwidth def_rt_bandwidth;
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+ struct rt_bandwidth *rt_b =
+ container_of(timer, struct rt_bandwidth, rt_period_timer);
+ ktime_t now;
+ int overrun;
+ int idle = 0;
+
+ for (;;) {
+ now = hrtimer_cb_get_time(timer);
+ overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+
+ if (!overrun)
+ break;
+
+ idle = do_sched_rt_period_timer(rt_b, overrun);
+ }
+
+ return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
+{
+ rt_b->rt_period = ns_to_ktime(period);
+ rt_b->rt_runtime = runtime;
+
+ raw_spin_lock_init(&rt_b->rt_runtime_lock);
+
+ hrtimer_init(&rt_b->rt_period_timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rt_b->rt_period_timer.function = sched_rt_period_timer;
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ if (hrtimer_active(&rt_b->rt_period_timer))
+ return;
+
+ raw_spin_lock(&rt_b->rt_runtime_lock);
+ start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+ raw_spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+{
+ struct rt_prio_array *array;
+ int i;
+
+ array = &rt_rq->active;
+ for (i = 0; i < MAX_RT_PRIO; i++) {
+ INIT_LIST_HEAD(array->queue + i);
+ __clear_bit(i, array->bitmap);
+ }
+ /* delimiter for bitsearch: */
+ __set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
+ rt_rq->rt_nr_migratory = 0;
+ rt_rq->overloaded = 0;
+ plist_head_init(&rt_rq->pushable_tasks);
+#endif
+
+ rt_rq->rt_time = 0;
+ rt_rq->rt_throttled = 0;
+ rt_rq->rt_runtime = 0;
+ raw_spin_lock_init(&rt_rq->rt_runtime_lock);
+}
+
#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ hrtimer_cancel(&rt_b->rt_period_timer);
+}
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return rt_se->rt_rq;
}
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
+ if (tg->rt_se)
+ destroy_rt_bandwidth(&tg->rt_bandwidth);
+
+ for_each_possible_cpu(i) {
+ if (tg->rt_rq)
+ kfree(tg->rt_rq[i]);
+ if (tg->rt_se)
+ kfree(tg->rt_se[i]);
+ }
+
+ kfree(tg->rt_rq);
+ kfree(tg->rt_se);
+}
+
+void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu,
+ struct sched_rt_entity *parent)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->rt_nr_boosted = 0;
+ rt_rq->rq = rq;
+ rt_rq->tg = tg;
+
+ tg->rt_rq[cpu] = rt_rq;
+ tg->rt_se[cpu] = rt_se;
+
+ if (!rt_se)
+ return;
+
+ if (!parent)
+ rt_se->rt_rq = &rq->rt;
+ else
+ rt_se->rt_rq = parent->my_q;
+
+ rt_se->my_q = rt_rq;
+ rt_se->parent = parent;
+ INIT_LIST_HEAD(&rt_se->run_list);
+}
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ struct rt_rq *rt_rq;
+ struct sched_rt_entity *rt_se;
+ int i;
+
+ tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->rt_rq)
+ goto err;
+ tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
+ if (!tg->rt_se)
+ goto err;
+
+ init_rt_bandwidth(&tg->rt_bandwidth,
+ ktime_to_ns(def_rt_bandwidth.rt_period), 0);
+
+ for_each_possible_cpu(i) {
+ rt_rq = kzalloc_node(sizeof(struct rt_rq),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!rt_rq)
+ goto err;
+
+ rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+ GFP_KERNEL, cpu_to_node(i));
+ if (!rt_se)
+ goto err_free_rq;
+
+ init_rt_rq(rt_rq, cpu_rq(i));
+ rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
+ init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+ }
+
+ return 1;
+
+err_free_rq:
+ kfree(rt_rq);
+err:
+ return 0;
+}
+
#else /* CONFIG_RT_GROUP_SCHED */
#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void free_rt_sched_group(struct task_group *tg) { }
+
+int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
+{
+ return 1;
+}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
@@ -556,10 +732,35 @@ static void enable_runtime(struct rq *rq)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
+int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+ int cpu = (int)(long)hcpu;
+
+ switch (action) {
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ disable_runtime(cpu_rq(cpu));
+ return NOTIFY_OK;
+
+ case CPU_DOWN_FAILED:
+ case CPU_DOWN_FAILED_FROZEN:
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ enable_runtime(cpu_rq(cpu));
+ return NOTIFY_OK;
+
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
static int balance_runtime(struct rt_rq *rt_rq)
{
int more = 0;
+ if (!sched_feat(RT_RUNTIME_SHARE))
+ return more;
+
if (rt_rq->rt_time > rt_rq->rt_runtime) {
raw_spin_unlock(&rt_rq->rt_runtime_lock);
more = do_balance_runtime(rt_rq);
@@ -645,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
if (rt_rq->rt_throttled)
return rt_rq_throttled(rt_rq);
- if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
+ if (runtime >= sched_rt_period(rt_rq))
return 0;
balance_runtime(rt_rq);
@@ -954,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
}
/*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
+ * Put task to the head or the end of the run list without the overhead of
+ * dequeue followed by enqueue.
*/
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -999,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
cpu = task_cpu(p);
+ if (p->rt.nr_cpus_allowed == 1)
+ goto out;
+
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1175,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
-
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{
if (!task_running(rq, p) &&
@@ -1385,6 +1587,11 @@ static int push_rt_task(struct rq *rq)
if (!next_task)
return 0;
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ if (unlikely(task_running(rq, next_task)))
+ return 0;
+#endif
+
retry:
if (unlikely(next_task == rq->curr)) {
WARN_ON(1);
@@ -1650,13 +1857,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
pull_rt_task(rq);
}
-static inline void init_sched_rt_class(void)
+void init_sched_rt_class(void)
{
unsigned int i;
- for_each_possible_cpu(i)
+ for_each_possible_cpu(i) {
zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
GFP_KERNEL, cpu_to_node(i));
+ }
}
#endif /* CONFIG_SMP */
@@ -1797,7 +2005,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-static const struct sched_class rt_sched_class = {
+const struct sched_class rt_sched_class = {
.next = &fair_sched_class,
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
@@ -1832,7 +2040,7 @@ static const struct sched_class rt_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
-static void print_rt_stats(struct seq_file *m, int cpu)
+void print_rt_stats(struct seq_file *m, int cpu)
{
rt_rq_iter_t iter;
struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 00000000000..98c0c2623db
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
+
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/stop_machine.h>
+
+#include "cpupri.h"
+
+extern __read_mostly int scheduler_running;
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
+#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
+
+/*
+ * Helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define DEF_TIMESLICE (100 * HZ / 1000)
+
+/*
+ * single value that denotes runtime == period, ie unlimited time.
+ */
+#define RUNTIME_INF ((u64)~0ULL)
+
+static inline int rt_policy(int policy)
+{
+ if (policy == SCHED_FIFO || policy == SCHED_RR)
+ return 1;
+ return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+ return rt_policy(p->policy);
+}
+
+/*
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+ DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+ struct list_head queue[MAX_RT_PRIO];
+};
+
+struct rt_bandwidth {
+ /* nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
+ ktime_t rt_period;
+ u64 rt_runtime;
+ struct hrtimer rt_period_timer;
+};
+
+extern struct mutex sched_domains_mutex;
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#include <linux/cgroup.h>
+
+struct cfs_rq;
+struct rt_rq;
+
+static LIST_HEAD(task_groups);
+
+struct cfs_bandwidth {
+#ifdef CONFIG_CFS_BANDWIDTH
+ raw_spinlock_t lock;
+ ktime_t period;
+ u64 quota, runtime;
+ s64 hierarchal_quota;
+ u64 runtime_expires;
+
+ int idle, timer_active;
+ struct hrtimer period_timer, slack_timer;
+ struct list_head throttled_cfs_rq;
+
+ /* statistics */
+ int nr_periods, nr_throttled;
+ u64 throttled_time;
+#endif
+};
+
+/* task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* schedulable entities of this group on each cpu */
+ struct sched_entity **se;
+ /* runqueue "owned" by this group on each cpu */
+ struct cfs_rq **cfs_rq;
+ unsigned long shares;
+
+ atomic_t load_weight;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct sched_rt_entity **rt_se;
+ struct rt_rq **rt_rq;
+
+ struct rt_bandwidth rt_bandwidth;
+#endif
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+ struct autogroup *autogroup;
+#endif
+
+ struct cfs_bandwidth cfs_bandwidth;
+};
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
+
+/*
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * A weight of a cfs_rq is the sum of weights of which entities
+ * are queued on this cfs_rq, so a weight of a entity should not be
+ * too large, so as the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ * limitation from this.)
+ */
+#define MIN_SHARES (1UL << 1)
+#define MAX_SHARES (1UL << 18)
+#endif
+
+/* Default task group.
+ * Every task in system belong to this group at bootup.
+ */
+extern struct task_group root_task_group;
+
+typedef int (*tg_visitor)(struct task_group *, void *);
+
+extern int walk_tg_tree_from(struct task_group *from,
+ tg_visitor down, tg_visitor up, void *data);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ *
+ * Caller must hold rcu_lock or sufficient equivalent.
+ */
+static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
+{
+ return walk_tg_tree_from(&root_task_group, down, up, data);
+}
+
+extern int tg_nop(struct task_group *tg, void *data);
+
+extern void free_fair_sched_group(struct task_group *tg);
+extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
+extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
+ struct sched_entity *se, int cpu,
+ struct sched_entity *parent);
+extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+
+extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
+extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
+extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+
+extern void free_rt_sched_group(struct task_group *tg);
+extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
+extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
+ struct sched_rt_entity *rt_se, int cpu,
+ struct sched_rt_entity *parent);
+
+#else /* CONFIG_CGROUP_SCHED */
+
+struct cfs_bandwidth { };
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+ struct load_weight load;
+ unsigned long nr_running, h_nr_running;
+
+ u64 exec_clock;
+ u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
+
+ struct rb_root tasks_timeline;
+ struct rb_node *rb_leftmost;
+
+ struct list_head tasks;
+ struct list_head *balance_iterator;
+
+ /*
+ * 'curr' points to currently running entity on this cfs_rq.
+ * It is set to NULL otherwise (i.e when none are currently running).
+ */
+ struct sched_entity *curr, *next, *last, *skip;
+
+#ifdef CONFIG_SCHED_DEBUG
+ unsigned int nr_spread_over;
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
+
+ /*
+ * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+ * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+ * (like users, containers etc.)
+ *
+ * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+ * list is used during load balance.
+ */
+ int on_list;
+ struct list_head leaf_cfs_rq_list;
+ struct task_group *tg; /* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+ /*
+ * the part of load.weight contributed by tasks
+ */
+ unsigned long task_weight;
+
+ /*
+ * h_load = weight * f(tg)
+ *
+ * Where f(tg) is the recursive weight fraction assigned to
+ * this group.
+ */
+ unsigned long h_load;
+
+ /*
+ * Maintaining per-cpu shares distribution for group scheduling
+ *
+ * load_stamp is the last time we updated the load average
+ * load_last is the last time we updated the load average and saw load
+ * load_unacc_exec_time is currently unaccounted execution time
+ */
+ u64 load_avg;
+ u64 load_period;
+ u64 load_stamp, load_last, load_unacc_exec_time;
+
+ unsigned long load_contribution;
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_CFS_BANDWIDTH
+ int runtime_enabled;
+ u64 runtime_expires;
+ s64 runtime_remaining;
+
+ u64 throttled_timestamp;
+ int throttled, throttle_count;
+ struct list_head throttled_list;
+#endif /* CONFIG_CFS_BANDWIDTH */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+};
+
+static inline int rt_bandwidth_enabled(void)
+{
+ return sysctl_sched_rt_runtime >= 0;
+}
+
+/* Real-Time classes' related field in a runqueue: */
+struct rt_rq {
+ struct rt_prio_array active;
+ unsigned long rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+ struct {
+ int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+ int next; /* next highest */
+#endif
+ } highest_prio;
+#endif
+#ifdef CONFIG_SMP
+ unsigned long rt_nr_migratory;
+ unsigned long rt_nr_total;
+ int overloaded;
+ struct plist_head pushable_tasks;
+#endif
+ int rt_throttled;
+ u64 rt_time;
+ u64 rt_runtime;
+ /* Nests inside the rq lock: */
+ raw_spinlock_t rt_runtime_lock;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long rt_nr_boosted;
+
+ struct rq *rq;
+ struct list_head leaf_rt_rq_list;
+ struct task_group *tg;
+#endif
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+ atomic_t refcount;
+ atomic_t rto_count;
+ struct rcu_head rcu;
+ cpumask_var_t span;
+ cpumask_var_t online;
+
+ /*
+ * The "RT overload" flag: it gets set if a CPU has more than
+ * one runnable RT task.
+ */
+ cpumask_var_t rto_mask;
+ struct cpupri cpupri;
+};
+
+extern struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: those places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct rq {
+ /* runqueue lock: */
+ raw_spinlock_t lock;
+
+ /*
+ * nr_running and cpu_load should be in the same cacheline because
+ * remote CPUs use both these fields when doing load calculation.
+ */
+ unsigned long nr_running;
+ #define CPU_LOAD_IDX_MAX 5
+ unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ unsigned long last_load_update_tick;
+#ifdef CONFIG_NO_HZ
+ u64 nohz_stamp;
+ unsigned long nohz_flags;
+#endif
+ int skip_clock_update;
+
+ /* capture load from *all* tasks on this cpu: */
+ struct load_weight load;
+ unsigned long nr_load_updates;
+ u64 nr_switches;
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /* list of leaf cfs_rq on this cpu: */
+ struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct list_head leaf_rt_rq_list;
+#endif
+
+ /*
+ * This is part of a global counter where only the total sum
+ * over all CPUs matters. A task can increase this counter on
+ * one CPU and if it got migrated afterwards it may decrease
+ * it on another CPU. Always updated under the runqueue lock:
+ */
+ unsigned long nr_uninterruptible;
+
+ struct task_struct *curr, *idle, *stop;
+ unsigned long next_balance;
+ struct mm_struct *prev_mm;
+
+ u64 clock;
+ u64 clock_task;
+
+ atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+ struct root_domain *rd;
+ struct sched_domain *sd;
+
+ unsigned long cpu_power;
+
+ unsigned char idle_balance;
+ /* For active balancing */
+ int post_schedule;
+ int active_balance;
+ int push_cpu;
+ struct cpu_stop_work active_balance_work;
+ /* cpu of this runqueue: */
+ int cpu;
+ int online;
+
+ u64 rt_avg;
+ u64 age_stamp;
+ u64 idle_stamp;
+ u64 avg_idle;
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+#ifdef CONFIG_PARAVIRT
+ u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ u64 prev_steal_time_rq;
+#endif
+
+ /* calc_load related fields */
+ unsigned long calc_load_update;
+ long calc_load_active;
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+ int hrtick_csd_pending;
+ struct call_single_data hrtick_csd;
+#endif
+ struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+ /* latency stats */
+ struct sched_info rq_sched_info;
+ unsigned long long rq_cpu_time;
+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+ /* sys_sched_yield() stats */
+ unsigned int yld_count;
+
+ /* schedule() stats */
+ unsigned int sched_switch;
+ unsigned int sched_count;
+ unsigned int sched_goidle;
+
+ /* try_to_wake_up() stats */
+ unsigned int ttwu_count;
+ unsigned int ttwu_local;
+#endif
+
+#ifdef CONFIG_SMP
+ struct llist_head wake_list;
+#endif
+};
+
+static inline int cpu_of(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+ return rq->cpu;
+#else
+ return 0;
+#endif
+}
+
+DECLARE_PER_CPU(struct rq, runqueues);
+
+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
+#define this_rq() (&__get_cpu_var(runqueues))
+#define task_rq(p) cpu_rq(task_cpu(p))
+#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() (&__raw_get_cpu_var(runqueues))
+
+#ifdef CONFIG_SMP
+
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
+ __sd; __sd = __sd->parent)
+
+#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
+
+/**
+ * highest_flag_domain - Return highest sched_domain containing flag.
+ * @cpu: The cpu whose highest level of sched domain is to
+ * be returned.
+ * @flag: The flag to check for the highest sched_domain
+ * for the given cpu.
+ *
+ * Returns the highest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+{
+ struct sched_domain *sd, *hsd = NULL;
+
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & flag))
+ break;
+ hsd = sd;
+ }
+
+ return hsd;
+}
+
+DECLARE_PER_CPU(struct sched_domain *, sd_llc);
+DECLARE_PER_CPU(int, sd_llc_id);
+
+#endif /* CONFIG_SMP */
+
+#include "stats.h"
+#include "auto_group.h"
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ struct task_group *tg;
+ struct cgroup_subsys_state *css;
+
+ css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+ lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock));
+ tg = container_of(css, struct task_group, css);
+
+ return autogroup_task_group(p, tg);
+}
+
+/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
+ struct task_group *tg = task_group(p);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = tg->cfs_rq[cpu];
+ p->se.parent = tg->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ p->rt.rt_rq = tg->rt_rq[cpu];
+ p->rt.parent = tg->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline struct task_group *task_group(struct task_struct *p)
+{
+ return NULL;
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+ set_task_rq(p, cpu);
+#ifdef CONFIG_SMP
+ /*
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+ * successfuly executed on another CPU. We must ensure that updates of
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+ task_thread_info(p)->cpu = cpu;
+#endif
+}
+
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# include <linux/jump_label.h>
+# define const_debug __read_mostly
+#else
+# define const_debug const
+#endif
+
+extern const_debug unsigned int sysctl_sched_features;
+
+#define SCHED_FEAT(name, enabled) \
+ __SCHED_FEAT_##name ,
+
+enum {
+#include "features.h"
+ __SCHED_FEAT_NR,
+};
+
+#undef SCHED_FEAT
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
+static __always_inline bool static_branch__true(struct jump_label_key *key)
+{
+ return likely(static_branch(key)); /* Not out of line branch. */
+}
+
+static __always_inline bool static_branch__false(struct jump_label_key *key)
+{
+ return unlikely(static_branch(key)); /* Out of line branch. */
+}
+
+#define SCHED_FEAT(name, enabled) \
+static __always_inline bool static_branch_##name(struct jump_label_key *key) \
+{ \
+ return static_branch__##enabled(key); \
+}
+
+#include "features.h"
+
+#undef SCHED_FEAT
+
+extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
+#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
+#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
+#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
+
+static inline u64 global_rt_period(void)
+{
+ return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+}
+
+static inline u64 global_rt_runtime(void)
+{
+ if (sysctl_sched_rt_runtime < 0)
+ return RUNTIME_INF;
+
+ return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
+}
+
+
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
+ return task_current(rq, p);
+#endif
+}
+
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
+#endif
+
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
+ rq->lock.owner = current;
+#endif
+ /*
+ * If we are tracking spinlock dependencies then we have to
+ * fix up the runqueue lock - which gets 'carried over' from
+ * prev into current:
+ */
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ raw_spin_unlock_irq(&rq->lock);
+#else
+ raw_spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
+
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+ lw->weight = w;
+ lw->inv_weight = 0;
+}
+
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
+ * scaled version of the new time slice allocation that they receive on time
+ * slice expiry etc.
+ */
+
+#define WEIGHT_IDLEPRIO 3
+#define WMULT_IDLEPRIO 1431655765
+
+/*
+ * Nice levels are multiplicative, with a gentle 10% change for every
+ * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
+ * nice 1, it will get ~10% less CPU time than another CPU-bound task
+ * that remained on nice 0.
+ *
+ * The "10% effect" is relative and cumulative: from _any_ nice level,
+ * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
+ */
+static const int prio_to_weight[40] = {
+ /* -20 */ 88761, 71755, 56483, 46273, 36291,
+ /* -15 */ 29154, 23254, 18705, 14949, 11916,
+ /* -10 */ 9548, 7620, 6100, 4904, 3906,
+ /* -5 */ 3121, 2501, 1991, 1586, 1277,
+ /* 0 */ 1024, 820, 655, 526, 423,
+ /* 5 */ 335, 272, 215, 172, 137,
+ /* 10 */ 110, 87, 70, 56, 45,
+ /* 15 */ 36, 29, 23, 18, 15,
+};
+
+/*
+ * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
+static const u32 prio_to_wmult[40] = {
+ /* -20 */ 48388, 59856, 76040, 92818, 118348,
+ /* -15 */ 147320, 184698, 229616, 287308, 360437,
+ /* -10 */ 449829, 563644, 704093, 875809, 1099582,
+ /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
+ /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
+ /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
+ /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
+ /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
+};
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+
+#define sched_class_highest (&stop_sched_class)
+#define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
+
+extern const struct sched_class stop_sched_class;
+extern const struct sched_class rt_sched_class;
+extern const struct sched_class fair_sched_class;
+extern const struct sched_class idle_sched_class;
+
+
+#ifdef CONFIG_SMP
+
+extern void trigger_load_balance(struct rq *rq, int cpu);
+extern void idle_balance(int this_cpu, struct rq *this_rq);
+
+#else /* CONFIG_SMP */
+
+static inline void idle_balance(int cpu, struct rq *rq)
+{
+}
+
+#endif
+
+extern void sysrq_sched_debug_show(void);
+extern void sched_init_granularity(void);
+extern void update_max_interval(void);
+extern void update_group_power(struct sched_domain *sd, int cpu);
+extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
+extern void init_sched_rt_class(void);
+extern void init_sched_fair_class(void);
+
+extern void resched_task(struct task_struct *p);
+extern void resched_cpu(int cpu);
+
+extern struct rt_bandwidth def_rt_bandwidth;
+extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+
+extern void update_cpu_load(struct rq *this_rq);
+
+#ifdef CONFIG_CGROUP_CPUACCT
+#include <linux/cgroup.h>
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ if (!ca || !ca->css.cgroup->parent)
+ return NULL;
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+#else
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+#endif
+
+static inline void inc_nr_running(struct rq *rq)
+{
+ rq->nr_running++;
+}
+
+static inline void dec_nr_running(struct rq *rq)
+{
+ rq->nr_running--;
+}
+
+extern void update_rq_clock(struct rq *rq);
+
+extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
+extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
+
+extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+
+extern const_debug unsigned int sysctl_sched_time_avg;
+extern const_debug unsigned int sysctl_sched_nr_migrate;
+extern const_debug unsigned int sysctl_sched_migration_cost;
+
+static inline u64 sched_avg_period(void)
+{
+ return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+void calc_load_account_idle(struct rq *this_rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+
+/*
+ * Use hrtick when:
+ * - enabled by features
+ * - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+ if (!sched_feat(HRTICK))
+ return 0;
+ if (!cpu_active(cpu_of(rq)))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrtick_timer);
+}
+
+void hrtick_start(struct rq *rq, u64 delay);
+
+#else
+
+static inline int hrtick_enabled(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SCHED_HRTICK */
+
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+ rq->rt_avg += rt_delta;
+ sched_avg_update(rq);
+}
+#else
+static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
+static inline void sched_avg_update(struct rq *rq) { }
+#endif
+
+extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_PREEMPT
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
+
+/*
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations. This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below. However, it
+ * also adds more overhead and therefore may reduce throughput.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ raw_spin_unlock(&this_rq->lock);
+ double_rq_lock(this_rq, busiest);
+
+ return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry. This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(this_rq->lock)
+ __acquires(busiest->lock)
+ __acquires(this_rq->lock)
+{
+ int ret = 0;
+
+ if (unlikely(!raw_spin_trylock(&busiest->lock))) {
+ if (busiest < this_rq) {
+ raw_spin_unlock(&this_rq->lock);
+ raw_spin_lock(&busiest->lock);
+ raw_spin_lock_nested(&this_rq->lock,
+ SINGLE_DEPTH_NESTING);
+ ret = 1;
+ } else
+ raw_spin_lock_nested(&busiest->lock,
+ SINGLE_DEPTH_NESTING);
+ }
+ return ret;
+}
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+ if (unlikely(!irqs_disabled())) {
+ /* printk() doesn't work good under rq->lock */
+ raw_spin_unlock(&this_rq->lock);
+ BUG_ON(1);
+ }
+
+ return _double_lock_balance(this_rq, busiest);
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+ __releases(busiest->lock)
+{
+ raw_spin_unlock(&busiest->lock);
+ lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else {
+ if (rq1 < rq2) {
+ raw_spin_lock(&rq1->lock);
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(&rq2->lock);
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
+ }
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ raw_spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
+#endif
+
+extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
+extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
+extern void print_cfs_stats(struct seq_file *m, int cpu);
+extern void print_rt_stats(struct seq_file *m, int cpu);
+
+extern void init_cfs_rq(struct cfs_rq *cfs_rq);
+extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
+extern void unthrottle_offline_cfs_rqs(struct rq *rq);
+
+extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+
+#ifdef CONFIG_NO_HZ
+enum rq_nohz_flag_bits {
+ NOHZ_TICK_STOPPED,
+ NOHZ_BALANCE_KICK,
+ NOHZ_IDLE,
+};
+
+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 00000000000..2a581ba8e19
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
+
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "sched.h"
+
+/*
+ * bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort)
+ */
+#define SCHEDSTAT_VERSION 15
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+ int cpu;
+ int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
+ char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+
+ if (mask_str == NULL)
+ return -ENOMEM;
+
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ for_each_online_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+ struct sched_domain *sd;
+ int dcount = 0;
+#endif
+
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_count,
+ rq->sched_switch, rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+ seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+ /* domain-specific stats */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ enum cpu_idle_type itype;
+
+ cpumask_scnprintf(mask_str, mask_len,
+ sched_domain_span(sd));
+ seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+ itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
+ sd->lb_balanced[itype],
+ sd->lb_failed[itype],
+ sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
+ sd->lb_nobusyq[itype],
+ sd->lb_nobusyg[itype]);
+ }
+ seq_printf(seq,
+ " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
+ }
+ rcu_read_unlock();
+#endif
+ }
+ kfree(mask_str);
+ return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
+ char *buf = kmalloc(size, GFP_KERNEL);
+ struct seq_file *m;
+ int res;
+
+ if (!buf)
+ return -ENOMEM;
+ res = single_open(file, show_schedstat, NULL);
+ if (!res) {
+ m = file->private_data;
+ m->buf = buf;
+ m->size = size;
+ } else
+ kfree(buf);
+ return res;
+}
+
+static const struct file_operations proc_schedstat_operations = {
+ .open = schedstat_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init proc_schedstat_init(void)
+{
+ proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+ return 0;
+}
+module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56..2ef90a51ec5 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
#ifdef CONFIG_SCHEDSTATS
-/*
- * bump this up when changing the output format or the meaning of an existing
- * format, so that tools can adapt (or abort)
- */
-#define SCHEDSTAT_VERSION 15
-
-static int show_schedstat(struct seq_file *seq, void *v)
-{
- int cpu;
- int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
- char *mask_str = kmalloc(mask_len, GFP_KERNEL);
-
- if (mask_str == NULL)
- return -ENOMEM;
-
- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
-#ifdef CONFIG_SMP
- struct sched_domain *sd;
- int dcount = 0;
-#endif
-
- /* runqueue-specific stats */
- seq_printf(seq,
- "cpu%d %u %u %u %u %u %u %llu %llu %lu",
- cpu, rq->yld_count,
- rq->sched_switch, rq->sched_count, rq->sched_goidle,
- rq->ttwu_count, rq->ttwu_local,
- rq->rq_cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
-
- seq_printf(seq, "\n");
-
-#ifdef CONFIG_SMP
- /* domain-specific stats */
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- enum cpu_idle_type itype;
-
- cpumask_scnprintf(mask_str, mask_len,
- sched_domain_span(sd));
- seq_printf(seq, "domain%d %s", dcount++, mask_str);
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
- sd->lb_count[itype],
- sd->lb_balanced[itype],
- sd->lb_failed[itype],
- sd->lb_imbalance[itype],
- sd->lb_gained[itype],
- sd->lb_hot_gained[itype],
- sd->lb_nobusyq[itype],
- sd->lb_nobusyg[itype]);
- }
- seq_printf(seq,
- " %u %u %u %u %u %u %u %u %u %u %u %u\n",
- sd->alb_count, sd->alb_failed, sd->alb_pushed,
- sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
- sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine,
- sd->ttwu_move_balance);
- }
- rcu_read_unlock();
-#endif
- }
- kfree(mask_str);
- return 0;
-}
-
-static int schedstat_open(struct inode *inode, struct file *file)
-{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
-
- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
- return res;
-}
-
-static const struct file_operations proc_schedstat_operations = {
- .open = schedstat_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
-static int __init proc_schedstat_init(void)
-{
- proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
- return 0;
-}
-module_init(proc_schedstat_init);
/*
* Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
return;
raw_spin_lock(&cputimer->lock);
- cputimer->cputime.utime =
- cputime_add(cputimer->cputime.utime, cputime);
+ cputimer->cputime.utime += cputime;
raw_spin_unlock(&cputimer->lock);
}
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
return;
raw_spin_lock(&cputimer->lock);
- cputimer->cputime.stime =
- cputime_add(cputimer->cputime.stime, cputime);
+ cputimer->cputime.stime += cputime;
raw_spin_unlock(&cputimer->lock);
}
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb..7b386e86fd2 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
+#include "sched.h"
+
/*
* stop-task scheduling class.
*
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-static const struct sched_class stop_sched_class = {
+const struct sched_class stop_sched_class = {
.next = &rt_sched_class,
.enqueue_task = enqueue_task_stop,
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 57d4b13b631..e8d76c5895e 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -6,6 +6,7 @@
* This defines a simple but solid secure-computing mode.
*/
+#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/sched.h>
#include <linux/compat.h>
@@ -54,6 +55,7 @@ void __secure_computing(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
+ audit_seccomp(this_syscall);
do_exit(SIGKILL);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index b3f78d09a10..c73c4284160 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -28,6 +28,7 @@
#include <linux/freezer.h>
#include <linux/pid_namespace.h>
#include <linux/nsproxy.h>
+#include <linux/user_namespace.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
+/*
+ * map the uid in struct cred into user namespace *ns
+ */
+static inline uid_t map_cred_ns(const struct cred *cred,
+ struct user_namespace *ns)
+{
+ return user_ns_map_uid(ns, cred, cred->uid);
+}
+
+#ifdef CONFIG_USER_NS
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+ if (current_user_ns() == task_cred_xxx(t, user_ns))
+ return;
+
+ if (SI_FROMKERNEL(info))
+ return;
+
+ info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns),
+ current_cred(), info->si_uid);
+}
+#else
+static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t)
+{
+ return;
+}
+#endif
+
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
@@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
q->info.si_pid = 0;
break;
}
+
+ userns_fixup_signal_uid(&q->info, t);
+
} else if (!is_si_special(info)) {
if (sig >= SIGRTMIN && info->si_code != SI_USER) {
/*
@@ -1626,13 +1658,12 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
- info.si_uid = __task_cred(tsk)->uid;
+ info.si_uid = map_cred_ns(__task_cred(tsk),
+ task_cred_xxx(tsk->parent, user_ns));
rcu_read_unlock();
- info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
- tsk->signal->utime));
- info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
- tsk->signal->stime));
+ info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
+ info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
info.si_status = tsk->exit_code & 0x7f;
if (tsk->exit_code & 0x80)
@@ -1711,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
- info.si_uid = __task_cred(tsk)->uid;
+ info.si_uid = map_cred_ns(__task_cred(tsk),
+ task_cred_xxx(parent, user_ns));
rcu_read_unlock();
info.si_utime = cputime_to_clock_t(tsk->utime);
@@ -1994,8 +2026,6 @@ static bool do_signal_stop(int signr)
*/
if (!(sig->flags & SIGNAL_STOP_STOPPED))
sig->group_exit_code = signr;
- else
- WARN_ON_ONCE(!current->ptrace);
sig->group_stop_count = 0;
@@ -2129,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info,
info->si_signo = signr;
info->si_errno = 0;
info->si_code = SI_USER;
+ rcu_read_lock();
info->si_pid = task_pid_vnr(current->parent);
- info->si_uid = task_uid(current->parent);
+ info->si_uid = map_cred_ns(__task_cred(current->parent),
+ current_user_ns());
+ rcu_read_unlock();
}
/* If the (new) signal is now blocked, requeue it. */
@@ -2322,6 +2355,27 @@ relock:
return signr;
}
+/**
+ * block_sigmask - add @ka's signal mask to current->blocked
+ * @ka: action for @signr
+ * @signr: signal that has been successfully delivered
+ *
+ * This function should be called when a signal has succesfully been
+ * delivered. It adds the mask of signals for @ka to current->blocked
+ * so that they are blocked during the execution of the signal
+ * handler. In addition, @signr will be blocked unless %SA_NODEFER is
+ * set in @ka->sa.sa_flags.
+ */
+void block_sigmask(struct k_sigaction *ka, int signr)
+{
+ sigset_t blocked;
+
+ sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, signr);
+ set_current_blocked(&blocked);
+}
+
/*
* It could be that complete_signal() picked us to notify about the
* group-wide signal. Other threads should be notified now to take
@@ -2359,8 +2413,15 @@ void exit_signals(struct task_struct *tsk)
int group_stop = 0;
sigset_t unblocked;
+ /*
+ * @tsk is about to have PF_EXITING set - lock out users which
+ * expect stable threadgroup.
+ */
+ threadgroup_change_begin(tsk);
+
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
+ threadgroup_change_end(tsk);
return;
}
@@ -2370,6 +2431,9 @@ void exit_signals(struct task_struct *tsk)
* see wants_signal(), do_signal_stop().
*/
tsk->flags |= PF_EXITING;
+
+ threadgroup_change_end(tsk);
+
if (!signal_pending(tsk))
goto out;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff..4eb3a0fa351 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
- rcu_irq_exit();
#ifdef CONFIG_NO_HZ
/* Make sure that timer wheel updates are propagated */
if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
- tick_nohz_stop_sched_tick(0);
+ tick_nohz_irq_exit();
#endif
+ rcu_irq_exit();
preempt_enable_no_resched();
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd07..40701538fbd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
unsigned long maxrss = 0;
memset((char *) r, 0, sizeof *r);
- utime = stime = cputime_zero;
+ utime = stime = 0;
if (who == RUSAGE_THREAD) {
task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
case RUSAGE_SELF:
thread_group_times(p, &tgutime, &tgstime);
- utime = cputime_add(utime, tgutime);
- stime = cputime_add(stime, tgstime);
+ utime += tgutime;
+ stime += tgstime;
r->ru_nvcsw += p->signal->nvcsw;
r->ru_nivcsw += p->signal->nivcsw;
r->ru_minflt += p->signal->min_flt;
@@ -1692,6 +1692,124 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+static int prctl_set_mm(int opt, unsigned long addr,
+ unsigned long arg4, unsigned long arg5)
+{
+ unsigned long rlim = rlimit(RLIMIT_DATA);
+ unsigned long vm_req_flags;
+ unsigned long vm_bad_flags;
+ struct vm_area_struct *vma;
+ int error = 0;
+ struct mm_struct *mm = current->mm;
+
+ if (arg4 | arg5)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (addr >= TASK_SIZE)
+ return -EINVAL;
+
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, addr);
+
+ if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
+ /* It must be existing VMA */
+ if (!vma || vma->vm_start > addr)
+ goto out;
+ }
+
+ error = -EINVAL;
+ switch (opt) {
+ case PR_SET_MM_START_CODE:
+ case PR_SET_MM_END_CODE:
+ vm_req_flags = VM_READ | VM_EXEC;
+ vm_bad_flags = VM_WRITE | VM_MAYSHARE;
+
+ if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+ (vma->vm_flags & vm_bad_flags))
+ goto out;
+
+ if (opt == PR_SET_MM_START_CODE)
+ mm->start_code = addr;
+ else
+ mm->end_code = addr;
+ break;
+
+ case PR_SET_MM_START_DATA:
+ case PR_SET_MM_END_DATA:
+ vm_req_flags = VM_READ | VM_WRITE;
+ vm_bad_flags = VM_EXEC | VM_MAYSHARE;
+
+ if ((vma->vm_flags & vm_req_flags) != vm_req_flags ||
+ (vma->vm_flags & vm_bad_flags))
+ goto out;
+
+ if (opt == PR_SET_MM_START_DATA)
+ mm->start_data = addr;
+ else
+ mm->end_data = addr;
+ break;
+
+ case PR_SET_MM_START_STACK:
+
+#ifdef CONFIG_STACK_GROWSUP
+ vm_req_flags = VM_READ | VM_WRITE | VM_GROWSUP;
+#else
+ vm_req_flags = VM_READ | VM_WRITE | VM_GROWSDOWN;
+#endif
+ if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
+ goto out;
+
+ mm->start_stack = addr;
+ break;
+
+ case PR_SET_MM_START_BRK:
+ if (addr <= mm->end_data)
+ goto out;
+
+ if (rlim < RLIM_INFINITY &&
+ (mm->brk - addr) +
+ (mm->end_data - mm->start_data) > rlim)
+ goto out;
+
+ mm->start_brk = addr;
+ break;
+
+ case PR_SET_MM_BRK:
+ if (addr <= mm->end_data)
+ goto out;
+
+ if (rlim < RLIM_INFINITY &&
+ (addr - mm->start_brk) +
+ (mm->end_data - mm->start_data) > rlim)
+ goto out;
+
+ mm->brk = addr;
+ break;
+
+ default:
+ error = -EINVAL;
+ goto out;
+ }
+
+ error = 0;
+
+out:
+ up_read(&mm->mmap_sem);
+
+ return error;
+}
+#else /* CONFIG_CHECKPOINT_RESTORE */
+static int prctl_set_mm(int opt, unsigned long addr,
+ unsigned long arg4, unsigned long arg5)
+{
+ return -EINVAL;
+}
+#endif
+
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -1841,6 +1959,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
else
error = PR_MCE_KILL_DEFAULT;
break;
+ case PR_SET_MM:
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae271964385..f487f257e05 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -803,6 +803,15 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ {
+ .procname = "panic_on_stackoverflow",
+ .data = &sysctl_panic_on_stackoverflow,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
{
.procname = "bootloader_type",
.data = &bootloader_type,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 6318b511afa..a650694883a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1354,7 +1354,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
fput(file);
out_putname:
- putname(pathname);
+ __putname(pathname);
out:
return result;
}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index b26c2228fe9..2cf9cc7aa10 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -25,7 +25,7 @@ config HIGH_RES_TIMERS
config GENERIC_CLOCKEVENTS_BUILD
bool
default y
- depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR
+ depends on GENERIC_CLOCKEVENTS
config GENERIC_CLOCKEVENTS_MIN_ADJUST
bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c436e790b21..8a46f5d6450 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -195,7 +195,7 @@ static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
struct alarm *alarm;
ktime_t expired = next->expires;
- if (expired.tv64 >= now.tv64)
+ if (expired.tv64 > now.tv64)
break;
alarm = container_of(next, struct alarm, node);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1ecd6ba36d6..9cd928f7a7c 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,7 +17,6 @@
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/smp.h>
-#include <linux/sysdev.h>
#include "tick-internal.h"
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index cf52fda2e09..a45ca167ab2 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,8 +23,8 @@
* o Allow clocksource drivers to be unregistered
*/
+#include <linux/device.h>
#include <linux/clocksource.h>
-#include <linux/sysdev.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
@@ -492,6 +492,22 @@ void clocksource_touch_watchdog(void)
}
/**
+ * clocksource_max_adjustment- Returns max adjustment amount
+ * @cs: Pointer to clocksource
+ *
+ */
+static u32 clocksource_max_adjustment(struct clocksource *cs)
+{
+ u64 ret;
+ /*
+ * We won't try to correct for more then 11% adjustments (110,000 ppm),
+ */
+ ret = (u64)cs->mult * 11;
+ do_div(ret,100);
+ return (u32)ret;
+}
+
+/**
* clocksource_max_deferment - Returns max time the clocksource can be deferred
* @cs: Pointer to clocksource
*
@@ -503,25 +519,28 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
/*
* Calculate the maximum number of cycles that we can pass to the
* cyc2ns function without overflowing a 64-bit signed result. The
- * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
- * is equivalent to the below.
- * max_cycles < (2^63)/cs->mult
- * max_cycles < 2^(log2((2^63)/cs->mult))
- * max_cycles < 2^(log2(2^63) - log2(cs->mult))
- * max_cycles < 2^(63 - log2(cs->mult))
- * max_cycles < 1 << (63 - log2(cs->mult))
+ * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj)
+ * which is equivalent to the below.
+ * max_cycles < (2^63)/(cs->mult + cs->maxadj)
+ * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj)))
+ * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj))
+ * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj))
+ * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj))
* Please note that we add 1 to the result of the log2 to account for
* any rounding errors, ensure the above inequality is satisfied and
* no overflow will occur.
*/
- max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
+ max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1));
/*
* The actual maximum number of cycles we can defer the clocksource is
* determined by the minimum of max_cycles and cs->mask.
+ * Note: Here we subtract the maxadj to make sure we don't sleep for
+ * too long if there's a large negative adjustment.
*/
max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
- max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
+ max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj,
+ cs->shift);
/*
* To ensure that the clocksource does not wrap whilst we are idle,
@@ -529,7 +548,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs)
* note a margin of 12.5% is used because this can be computed with
* a shift, versus say 10% which would require division.
*/
- return max_nsecs - (max_nsecs >> 5);
+ return max_nsecs - (max_nsecs >> 3);
}
#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
@@ -628,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs)
/**
* __clocksource_updatefreq_scale - Used update clocksource with new freq
- * @t: clocksource to be registered
+ * @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
*
@@ -640,7 +659,6 @@ static void clocksource_enqueue(struct clocksource *cs)
void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
u64 sec;
-
/*
* Calc the maximum number of seconds which we can run before
* wrapping around. For clocksources which have a mask > 32bit
@@ -651,7 +669,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
* ~ 0.06ppm granularity for NTP. We apply the same 12.5%
* margin as we do in clocksource_max_deferment()
*/
- sec = (cs->mask - (cs->mask >> 5));
+ sec = (cs->mask - (cs->mask >> 3));
do_div(sec, freq);
do_div(sec, scale);
if (!sec)
@@ -661,13 +679,27 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
NSEC_PER_SEC / scale, sec * scale);
+
+ /*
+ * for clocksources that have large mults, to avoid overflow.
+ * Since mult may be adjusted by ntp, add an safety extra margin
+ *
+ */
+ cs->maxadj = clocksource_max_adjustment(cs);
+ while ((cs->mult + cs->maxadj < cs->mult)
+ || (cs->mult - cs->maxadj > cs->mult)) {
+ cs->mult >>= 1;
+ cs->shift--;
+ cs->maxadj = clocksource_max_adjustment(cs);
+ }
+
cs->max_idle_ns = clocksource_max_deferment(cs);
}
EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
/**
* __clocksource_register_scale - Used to install new clocksources
- * @t: clocksource to be registered
+ * @cs: clocksource to be registered
* @scale: Scale factor multiplied against freq to get clocksource hz
* @freq: clocksource frequency (cycles per second) divided by scale
*
@@ -695,12 +727,18 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale);
/**
* clocksource_register - Used to install new clocksources
- * @t: clocksource to be registered
+ * @cs: clocksource to be registered
*
* Returns -EBUSY if registration fails, zero otherwise.
*/
int clocksource_register(struct clocksource *cs)
{
+ /* calculate max adjustment for given mult/shift */
+ cs->maxadj = clocksource_max_adjustment(cs);
+ WARN_ONCE(cs->mult + cs->maxadj < cs->mult,
+ "Clocksource %s might overflow on 11%% adjustment\n",
+ cs->name);
+
/* calculate max idle time permitted for this clocksource */
cs->max_idle_ns = clocksource_max_deferment(cs);
@@ -723,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating)
/**
* clocksource_change_rating - Change the rating of a registered clocksource
+ * @cs: clocksource to be changed
+ * @rating: new rating
*/
void clocksource_change_rating(struct clocksource *cs, int rating)
{
@@ -734,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating);
/**
* clocksource_unregister - remove a registered clocksource
+ * @cs: clocksource to be unregistered
*/
void clocksource_unregister(struct clocksource *cs)
{
@@ -749,13 +790,14 @@ EXPORT_SYMBOL(clocksource_unregister);
/**
* sysfs_show_current_clocksources - sysfs interface for current clocksource
* @dev: unused
+ * @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing current clocksource.
*/
static ssize_t
-sysfs_show_current_clocksources(struct sys_device *dev,
- struct sysdev_attribute *attr, char *buf)
+sysfs_show_current_clocksources(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
ssize_t count = 0;
@@ -769,14 +811,15 @@ sysfs_show_current_clocksources(struct sys_device *dev,
/**
* sysfs_override_clocksource - interface for manually overriding clocksource
* @dev: unused
+ * @attr: unused
* @buf: name of override clocksource
* @count: length of buffer
*
* Takes input from sysfs interface for manually overriding the default
* clocksource selection.
*/
-static ssize_t sysfs_override_clocksource(struct sys_device *dev,
- struct sysdev_attribute *attr,
+static ssize_t sysfs_override_clocksource(struct device *dev,
+ struct device_attribute *attr,
const char *buf, size_t count)
{
size_t ret = count;
@@ -804,13 +847,14 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
/**
* sysfs_show_available_clocksources - sysfs interface for listing clocksource
* @dev: unused
+ * @attr: unused
* @buf: char buffer to be filled with clocksource list
*
* Provides sysfs interface for listing registered clocksources
*/
static ssize_t
-sysfs_show_available_clocksources(struct sys_device *dev,
- struct sysdev_attribute *attr,
+sysfs_show_available_clocksources(struct device *dev,
+ struct device_attribute *attr,
char *buf)
{
struct clocksource *src;
@@ -839,35 +883,36 @@ sysfs_show_available_clocksources(struct sys_device *dev,
/*
* Sysfs setup bits:
*/
-static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
+static DEVICE_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources,
sysfs_override_clocksource);
-static SYSDEV_ATTR(available_clocksource, 0444,
+static DEVICE_ATTR(available_clocksource, 0444,
sysfs_show_available_clocksources, NULL);
-static struct sysdev_class clocksource_sysclass = {
+static struct bus_type clocksource_subsys = {
.name = "clocksource",
+ .dev_name = "clocksource",
};
-static struct sys_device device_clocksource = {
+static struct device device_clocksource = {
.id = 0,
- .cls = &clocksource_sysclass,
+ .bus = &clocksource_subsys,
};
static int __init init_clocksource_sysfs(void)
{
- int error = sysdev_class_register(&clocksource_sysclass);
+ int error = subsys_system_register(&clocksource_subsys, NULL);
if (!error)
- error = sysdev_register(&device_clocksource);
+ error = device_register(&device_clocksource);
if (!error)
- error = sysdev_create_file(
+ error = device_create_file(
&device_clocksource,
- &attr_current_clocksource);
+ &dev_attr_current_clocksource);
if (!error)
- error = sysdev_create_file(
+ error = device_create_file(
&device_clocksource,
- &attr_available_clocksource);
+ &dev_attr_available_clocksource);
return error;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f954282d9a8..fd4a7b1625a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -71,7 +71,7 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
- clockevents_exchange_device(NULL, dev);
+ clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
tick_broadcast_device.evtdev = dev;
if (!cpumask_empty(tick_get_broadcast_mask()))
tick_broadcast_start_periodic(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0b..7656642e4b8 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
}
EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
-/**
- * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
- *
- * When the next event is more than a tick into the future, stop the idle tick
- * Called either from the idle loop or from irq_exit() when an idle period was
- * just interrupted by an interrupt which did not cause a reschedule.
- */
-void tick_nohz_stop_sched_tick(int inidle)
+static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
{
- unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
- struct tick_sched *ts;
+ unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
ktime_t last_update, expires, now;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
u64 time_delta;
int cpu;
- local_irq_save(flags);
-
cpu = smp_processor_id();
ts = &per_cpu(tick_cpu_sched, cpu);
- /*
- * Call to tick_nohz_start_idle stops the last_update_time from being
- * updated. Thus, it must not be called in the event we are called from
- * irq_exit() with the prior state different than idle.
- */
- if (!inidle && !ts->inidle)
- goto end;
-
- /*
- * Set ts->inidle unconditionally. Even if the system did not
- * switch to NOHZ mode the cpu frequency governers rely on the
- * update of the idle time accounting in tick_nohz_start_idle().
- */
- ts->inidle = 1;
-
now = tick_nohz_start_idle(cpu, ts);
/*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
- goto end;
+ return;
if (need_resched())
- goto end;
+ return;
if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
(unsigned int) local_softirq_pending());
ratelimit++;
}
- goto end;
+ return;
}
ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
- rcu_enter_nohz();
}
ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
ts->next_jiffies = next_jiffies;
ts->last_jiffies = last_jiffies;
ts->sleep_length = ktime_sub(dev->next_event, now);
-end:
- local_irq_restore(flags);
+}
+
+/**
+ * tick_nohz_idle_enter - stop the idle tick from the idle task
+ *
+ * When the next event is more than a tick into the future, stop the idle tick
+ * Called when we start the idle loop.
+ *
+ * The arch is responsible of calling:
+ *
+ * - rcu_idle_enter() after its last use of RCU before the CPU is put
+ * to sleep.
+ * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
+ */
+void tick_nohz_idle_enter(void)
+{
+ struct tick_sched *ts;
+
+ WARN_ON_ONCE(irqs_disabled());
+
+ /*
+ * Update the idle state in the scheduler domain hierarchy
+ * when tick_nohz_stop_sched_tick() is called from the idle loop.
+ * State will be updated to busy during the first busy tick after
+ * exiting idle.
+ */
+ set_cpu_sd_state_idle();
+
+ local_irq_disable();
+
+ ts = &__get_cpu_var(tick_cpu_sched);
+ /*
+ * set ts->inidle unconditionally. even if the system did not
+ * switch to nohz mode the cpu frequency governers rely on the
+ * update of the idle time accounting in tick_nohz_start_idle().
+ */
+ ts->inidle = 1;
+ tick_nohz_stop_sched_tick(ts);
+
+ local_irq_enable();
+}
+
+/**
+ * tick_nohz_irq_exit - update next tick event from interrupt exit
+ *
+ * When an interrupt fires while we are idle and it doesn't cause
+ * a reschedule, it may still add, modify or delete a timer, enqueue
+ * an RCU callback, etc...
+ * So we need to re-calculate and reprogram the next tick event.
+ */
+void tick_nohz_irq_exit(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (!ts->inidle)
+ return;
+
+ tick_nohz_stop_sched_tick(ts);
}
/**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
}
/**
- * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
+ * tick_nohz_idle_exit - restart the idle tick from the idle task
*
* Restart the idle tick when the CPU is woken up from idle
+ * This also exit the RCU extended quiescent state. The CPU
+ * can use RCU again after this function is called.
*/
-void tick_nohz_restart_sched_tick(void)
+void tick_nohz_idle_exit(void)
{
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
ktime_t now;
local_irq_disable();
+
if (ts->idle_active || (ts->inidle && ts->tick_stopped))
now = ktime_get();
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
ts->inidle = 0;
- rcu_exit_nohz();
-
/* Update jiffies first */
select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2b021b0e850..e6a5a6bc276 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
/* calculate the delta since the last update_wall_time: */
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- /* return delta convert to nanoseconds using ntp adjusted mult. */
+ /* return delta convert to nanoseconds. */
return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
}
@@ -249,6 +249,8 @@ ktime_t ktime_get(void)
secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
nsecs += timekeeping_get_ns();
+ /* If arch requires, add in gettimeoffset() */
+ nsecs += arch_gettimeoffset();
} while (read_seqretry(&xtime_lock, seq));
/*
@@ -280,6 +282,8 @@ void ktime_get_ts(struct timespec *ts)
*ts = xtime;
tomono = wall_to_monotonic;
nsecs = timekeeping_get_ns();
+ /* If arch requires, add in gettimeoffset() */
+ nsecs += arch_gettimeoffset();
} while (read_seqretry(&xtime_lock, seq));
@@ -802,14 +806,44 @@ static void timekeeping_adjust(s64 offset)
s64 error, interval = timekeeper.cycle_interval;
int adj;
+ /*
+ * The point of this is to check if the error is greater then half
+ * an interval.
+ *
+ * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
+ *
+ * Note we subtract one in the shift, so that error is really error*2.
+ * This "saves" dividing(shifting) interval twice, but keeps the
+ * (error > interval) comparison as still measuring if error is
+ * larger then half an interval.
+ *
+ * Note: It does not "save" on aggravation when reading the code.
+ */
error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
if (error > interval) {
+ /*
+ * We now divide error by 4(via shift), which checks if
+ * the error is greater then twice the interval.
+ * If it is greater, we need a bigadjust, if its smaller,
+ * we can adjust by 1.
+ */
error >>= 2;
+ /*
+ * XXX - In update_wall_time, we round up to the next
+ * nanosecond, and store the amount rounded up into
+ * the error. This causes the likely below to be unlikely.
+ *
+ * The proper fix is to avoid rounding up by using
+ * the high precision timekeeper.xtime_nsec instead of
+ * xtime.tv_nsec everywhere. Fixing this will take some
+ * time.
+ */
if (likely(error <= interval))
adj = 1;
else
adj = timekeeping_bigadjust(error, &interval, &offset);
} else if (error < -interval) {
+ /* See comment above, this is just switched for the negative */
error >>= 2;
if (likely(error >= -interval)) {
adj = -1;
@@ -817,9 +851,65 @@ static void timekeeping_adjust(s64 offset)
offset = -offset;
} else
adj = timekeeping_bigadjust(error, &interval, &offset);
- } else
+ } else /* No adjustment needed */
return;
+ WARN_ONCE(timekeeper.clock->maxadj &&
+ (timekeeper.mult + adj > timekeeper.clock->mult +
+ timekeeper.clock->maxadj),
+ "Adjusting %s more then 11%% (%ld vs %ld)\n",
+ timekeeper.clock->name, (long)timekeeper.mult + adj,
+ (long)timekeeper.clock->mult +
+ timekeeper.clock->maxadj);
+ /*
+ * So the following can be confusing.
+ *
+ * To keep things simple, lets assume adj == 1 for now.
+ *
+ * When adj != 1, remember that the interval and offset values
+ * have been appropriately scaled so the math is the same.
+ *
+ * The basic idea here is that we're increasing the multiplier
+ * by one, this causes the xtime_interval to be incremented by
+ * one cycle_interval. This is because:
+ * xtime_interval = cycle_interval * mult
+ * So if mult is being incremented by one:
+ * xtime_interval = cycle_interval * (mult + 1)
+ * Its the same as:
+ * xtime_interval = (cycle_interval * mult) + cycle_interval
+ * Which can be shortened to:
+ * xtime_interval += cycle_interval
+ *
+ * So offset stores the non-accumulated cycles. Thus the current
+ * time (in shifted nanoseconds) is:
+ * now = (offset * adj) + xtime_nsec
+ * Now, even though we're adjusting the clock frequency, we have
+ * to keep time consistent. In other words, we can't jump back
+ * in time, and we also want to avoid jumping forward in time.
+ *
+ * So given the same offset value, we need the time to be the same
+ * both before and after the freq adjustment.
+ * now = (offset * adj_1) + xtime_nsec_1
+ * now = (offset * adj_2) + xtime_nsec_2
+ * So:
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * adj_2) + xtime_nsec_2
+ * And we know:
+ * adj_2 = adj_1 + 1
+ * So:
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * (adj_1+1)) + xtime_nsec_2
+ * (offset * adj_1) + xtime_nsec_1 =
+ * (offset * adj_1) + offset + xtime_nsec_2
+ * Canceling the sides:
+ * xtime_nsec_1 = offset + xtime_nsec_2
+ * Which gives us:
+ * xtime_nsec_2 = xtime_nsec_1 - offset
+ * Which simplfies to:
+ * xtime_nsec -= offset
+ *
+ * XXX - TODO: Doc ntp_error calculation.
+ */
timekeeper.mult += adj;
timekeeper.xtime_interval += interval;
timekeeper.xtime_nsec -= offset;
@@ -1140,6 +1230,8 @@ ktime_t ktime_get_monotonic_offset(void)
} while (read_seqretry(&xtime_lock, seq));
return timespec_to_ktime(wtom);
}
+EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
+
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/timer.c b/kernel/timer.c
index dbaa62422b1..a297ffcf888 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
}
}
+/* Stub timer callback for improperly used timers. */
+static void stub_timer(unsigned long data)
+{
+ WARN_ON(1);
+}
+
/*
* fixup_activate is called when:
* - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
debug_object_activate(timer, &timer_debug_descr);
return 0;
} else {
- WARN_ON_ONCE(1);
+ setup_timer(timer, stub_timer, 0);
+ return 1;
}
return 0;
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
}
}
+/*
+ * fixup_assert_init is called when:
+ * - an untracked/uninit-ed object is found
+ */
+static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
+{
+ struct timer_list *timer = addr;
+
+ switch (state) {
+ case ODEBUG_STATE_NOTAVAILABLE:
+ if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+ /*
+ * This is not really a fixup. The timer was
+ * statically initialized. We just make sure that it
+ * is tracked in the object tracker.
+ */
+ debug_object_init(timer, &timer_debug_descr);
+ return 0;
+ } else {
+ setup_timer(timer, stub_timer, 0);
+ return 1;
+ }
+ default:
+ return 0;
+ }
+}
+
static struct debug_obj_descr timer_debug_descr = {
- .name = "timer_list",
- .debug_hint = timer_debug_hint,
- .fixup_init = timer_fixup_init,
- .fixup_activate = timer_fixup_activate,
- .fixup_free = timer_fixup_free,
+ .name = "timer_list",
+ .debug_hint = timer_debug_hint,
+ .fixup_init = timer_fixup_init,
+ .fixup_activate = timer_fixup_activate,
+ .fixup_free = timer_fixup_free,
+ .fixup_assert_init = timer_fixup_assert_init,
};
static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
debug_object_free(timer, &timer_debug_descr);
}
+static inline void debug_timer_assert_init(struct timer_list *timer)
+{
+ debug_object_assert_init(timer, &timer_debug_descr);
+}
+
static void __init_timer(struct timer_list *timer,
const char *name,
struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
static inline void debug_timer_init(struct timer_list *timer) { }
static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
+static inline void debug_timer_assert_init(struct timer_list *timer) { }
#endif
static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
trace_timer_cancel(timer);
}
+static inline void debug_assert_init(struct timer_list *timer)
+{
+ debug_timer_assert_init(timer);
+}
+
static void __init_timer(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
unsigned long flags;
int ret = 0;
+ debug_assert_init(timer);
+
timer_stats_timer_clear_start_info(timer);
if (timer_pending(timer)) {
base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
unsigned long flags;
int ret = -1;
+ debug_assert_init(timer);
+
base = lock_timer_base(timer, &flags);
if (base->running_timer == timer)
@@ -1368,7 +1418,7 @@ SYSCALL_DEFINE0(getppid)
int pid;
rcu_read_lock();
- pid = task_tgid_vnr(current->real_parent);
+ pid = task_tgid_vnr(rcu_dereference(current->real_parent));
rcu_read_unlock();
return pid;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 16fc34a0806..cdea7b56b0c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -402,7 +402,7 @@ static int blk_remove_buf_file_callback(struct dentry *dentry)
static struct dentry *blk_create_buf_file_callback(const char *filename,
struct dentry *parent,
- int mode,
+ umode_t mode,
struct rchan_buf *buf,
int *is_global)
{
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 900b409543d..683d559a0ee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,11 +22,13 @@
#include <linux/hardirq.h>
#include <linux/kthread.h>
#include <linux/uaccess.h>
+#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/ftrace.h>
#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/ctype.h>
+#include <linux/sort.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/rcupdate.h>
@@ -152,7 +154,6 @@ void clear_ftrace_function(void)
ftrace_pid_function = ftrace_stub;
}
-#undef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
/*
* For those archs that do not test ftrace_trace_stop in their
@@ -948,13 +949,6 @@ struct ftrace_func_probe {
struct rcu_head rcu;
};
-enum {
- FTRACE_ENABLE_CALLS = (1 << 0),
- FTRACE_DISABLE_CALLS = (1 << 1),
- FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
- FTRACE_START_FUNC_RET = (1 << 3),
- FTRACE_STOP_FUNC_RET = (1 << 4),
-};
struct ftrace_func_entry {
struct hlist_node hlist;
unsigned long ip;
@@ -985,18 +979,19 @@ static struct ftrace_ops global_ops = {
.filter_hash = EMPTY_HASH,
};
-static struct dyn_ftrace *ftrace_new_addrs;
-
static DEFINE_MUTEX(ftrace_regex_lock);
struct ftrace_page {
struct ftrace_page *next;
+ struct dyn_ftrace *records;
int index;
- struct dyn_ftrace records[];
+ int size;
};
-#define ENTRIES_PER_PAGE \
- ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+static struct ftrace_page *ftrace_new_pgs;
+
+#define ENTRY_SIZE sizeof(struct dyn_ftrace)
+#define ENTRIES_PER_PAGE (PAGE_SIZE / ENTRY_SIZE)
/* estimate from running different kernels */
#define NR_TO_INIT 10000
@@ -1004,7 +999,10 @@ struct ftrace_page {
static struct ftrace_page *ftrace_pages_start;
static struct ftrace_page *ftrace_pages;
-static struct dyn_ftrace *ftrace_free_records;
+static bool ftrace_hash_empty(struct ftrace_hash *hash)
+{
+ return !hash || !hash->count;
+}
static struct ftrace_func_entry *
ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
@@ -1014,7 +1012,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
struct hlist_head *hhd;
struct hlist_node *n;
- if (!hash->count)
+ if (ftrace_hash_empty(hash))
return NULL;
if (hash->size_bits > 0)
@@ -1158,7 +1156,7 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
return NULL;
/* Empty hash? */
- if (!hash || !hash->count)
+ if (ftrace_hash_empty(hash))
return new_hash;
size = 1 << hash->size_bits;
@@ -1212,7 +1210,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
if (!src->count) {
free_ftrace_hash_rcu(*dst);
rcu_assign_pointer(*dst, EMPTY_HASH);
- return 0;
+ /* still need to update the function records */
+ ret = 0;
+ goto out;
}
/*
@@ -1281,9 +1281,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
filter_hash = rcu_dereference_raw(ops->filter_hash);
notrace_hash = rcu_dereference_raw(ops->notrace_hash);
- if ((!filter_hash || !filter_hash->count ||
+ if ((ftrace_hash_empty(filter_hash) ||
ftrace_lookup_ip(filter_hash, ip)) &&
- (!notrace_hash || !notrace_hash->count ||
+ (ftrace_hash_empty(notrace_hash) ||
!ftrace_lookup_ip(notrace_hash, ip)))
ret = 1;
else
@@ -1306,6 +1306,47 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
} \
}
+
+static int ftrace_cmp_recs(const void *a, const void *b)
+{
+ const struct dyn_ftrace *reca = a;
+ const struct dyn_ftrace *recb = b;
+
+ if (reca->ip > recb->ip)
+ return 1;
+ if (reca->ip < recb->ip)
+ return -1;
+ return 0;
+}
+
+/**
+ * ftrace_location - return true if the ip giving is a traced location
+ * @ip: the instruction pointer to check
+ *
+ * Returns 1 if @ip given is a pointer to a ftrace location.
+ * That is, the instruction that is either a NOP or call to
+ * the function tracer. It checks the ftrace internal tables to
+ * determine if the address belongs or not.
+ */
+int ftrace_location(unsigned long ip)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ struct dyn_ftrace key;
+
+ key.ip = ip;
+
+ for (pg = ftrace_pages_start; pg; pg = pg->next) {
+ rec = bsearch(&key, pg->records, pg->index,
+ sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs);
+ if (rec)
+ return 1;
+ }
+
+ return 0;
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1335,7 +1376,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (filter_hash) {
hash = ops->filter_hash;
other_hash = ops->notrace_hash;
- if (!hash || !hash->count)
+ if (ftrace_hash_empty(hash))
all = 1;
} else {
inc = !inc;
@@ -1345,7 +1386,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* If the notrace hash has no items,
* then there's nothing to do.
*/
- if (hash && !hash->count)
+ if (ftrace_hash_empty(hash))
return;
}
@@ -1362,8 +1403,8 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
match = 1;
} else {
- in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
- in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
+ in_hash = !!ftrace_lookup_ip(hash, rec->ip);
+ in_other_hash = !!ftrace_lookup_ip(other_hash, rec->ip);
/*
*
@@ -1371,7 +1412,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (filter_hash && in_hash && !in_other_hash)
match = 1;
else if (!filter_hash && in_hash &&
- (in_other_hash || !other_hash->count))
+ (in_other_hash || ftrace_hash_empty(other_hash)))
match = 1;
}
if (!match)
@@ -1405,40 +1446,12 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
__ftrace_hash_rec_update(ops, filter_hash, 1);
}
-static void ftrace_free_rec(struct dyn_ftrace *rec)
-{
- rec->freelist = ftrace_free_records;
- ftrace_free_records = rec;
- rec->flags |= FTRACE_FL_FREE;
-}
-
static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
{
- struct dyn_ftrace *rec;
-
- /* First check for freed records */
- if (ftrace_free_records) {
- rec = ftrace_free_records;
-
- if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
- FTRACE_WARN_ON_ONCE(1);
- ftrace_free_records = NULL;
+ if (ftrace_pages->index == ftrace_pages->size) {
+ /* We should have allocated enough */
+ if (WARN_ON(!ftrace_pages->next))
return NULL;
- }
-
- ftrace_free_records = rec->freelist;
- memset(rec, 0, sizeof(*rec));
- return rec;
- }
-
- if (ftrace_pages->index == ENTRIES_PER_PAGE) {
- if (!ftrace_pages->next) {
- /* allocate another page */
- ftrace_pages->next =
- (void *)get_zeroed_page(GFP_KERNEL);
- if (!ftrace_pages->next)
- return NULL;
- }
ftrace_pages = ftrace_pages->next;
}
@@ -1458,8 +1471,6 @@ ftrace_record_ip(unsigned long ip)
return NULL;
rec->ip = ip;
- rec->newlist = ftrace_new_addrs;
- ftrace_new_addrs = rec;
return rec;
}
@@ -1474,7 +1485,19 @@ static void print_ip_ins(const char *fmt, unsigned char *p)
printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
}
-static void ftrace_bug(int failed, unsigned long ip)
+/**
+ * ftrace_bug - report and shutdown function tracer
+ * @failed: The failed type (EFAULT, EINVAL, EPERM)
+ * @ip: The address that failed
+ *
+ * The arch code that enables or disables the function tracing
+ * can call ftrace_bug() when it has detected a problem in
+ * modifying the code. @failed should be one of either:
+ * EFAULT - if the problem happens on reading the @ip address
+ * EINVAL - if what is read at @ip is not what was expected
+ * EPERM - if the problem happens on writting to the @ip address
+ */
+void ftrace_bug(int failed, unsigned long ip)
{
switch (failed) {
case -EFAULT:
@@ -1516,24 +1539,19 @@ int ftrace_text_reserved(void *start, void *end)
return 0;
}
-
-static int
-__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
{
- unsigned long ftrace_addr;
unsigned long flag = 0UL;
- ftrace_addr = (unsigned long)FTRACE_ADDR;
-
/*
- * If we are enabling tracing:
+ * If we are updating calls:
*
* If the record has a ref count, then we need to enable it
* because someone is using it.
*
* Otherwise we make sure its disabled.
*
- * If we are disabling tracing, then disable all records that
+ * If we are disabling calls, then disable all records that
* are enabled.
*/
if (enable && (rec->flags & ~FTRACE_FL_MASK))
@@ -1541,18 +1559,72 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
- return 0;
+ return FTRACE_UPDATE_IGNORE;
if (flag) {
- rec->flags |= FTRACE_FL_ENABLED;
+ if (update)
+ rec->flags |= FTRACE_FL_ENABLED;
+ return FTRACE_UPDATE_MAKE_CALL;
+ }
+
+ if (update)
+ rec->flags &= ~FTRACE_FL_ENABLED;
+
+ return FTRACE_UPDATE_MAKE_NOP;
+}
+
+/**
+ * ftrace_update_record, set a record that now is tracing or not
+ * @rec: the record to update
+ * @enable: set to 1 if the record is tracing, zero to force disable
+ *
+ * The records that represent all functions that can be traced need
+ * to be updated when tracing has been enabled.
+ */
+int ftrace_update_record(struct dyn_ftrace *rec, int enable)
+{
+ return ftrace_check_record(rec, enable, 1);
+}
+
+/**
+ * ftrace_test_record, check if the record has been enabled or not
+ * @rec: the record to test
+ * @enable: set to 1 to check if enabled, 0 if it is disabled
+ *
+ * The arch code may need to test if a record is already set to
+ * tracing to determine how to modify the function code that it
+ * represents.
+ */
+int ftrace_test_record(struct dyn_ftrace *rec, int enable)
+{
+ return ftrace_check_record(rec, enable, 0);
+}
+
+static int
+__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
+{
+ unsigned long ftrace_addr;
+ int ret;
+
+ ftrace_addr = (unsigned long)FTRACE_ADDR;
+
+ ret = ftrace_update_record(rec, enable);
+
+ switch (ret) {
+ case FTRACE_UPDATE_IGNORE:
+ return 0;
+
+ case FTRACE_UPDATE_MAKE_CALL:
return ftrace_make_call(rec, ftrace_addr);
+
+ case FTRACE_UPDATE_MAKE_NOP:
+ return ftrace_make_nop(NULL, rec, ftrace_addr);
}
- rec->flags &= ~FTRACE_FL_ENABLED;
- return ftrace_make_nop(NULL, rec, ftrace_addr);
+ return -1; /* unknow ftrace bug */
}
-static void ftrace_replace_code(int enable)
+static void ftrace_replace_code(int update)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -1562,11 +1634,7 @@ static void ftrace_replace_code(int enable)
return;
do_for_each_ftrace_rec(pg, rec) {
- /* Skip over free records */
- if (rec->flags & FTRACE_FL_FREE)
- continue;
-
- failed = __ftrace_replace_code(rec, enable);
+ failed = __ftrace_replace_code(rec, update);
if (failed) {
ftrace_bug(failed, rec->ip);
/* Stop processing */
@@ -1575,6 +1643,78 @@ static void ftrace_replace_code(int enable)
} while_for_each_ftrace_rec();
}
+struct ftrace_rec_iter {
+ struct ftrace_page *pg;
+ int index;
+};
+
+/**
+ * ftrace_rec_iter_start, start up iterating over traced functions
+ *
+ * Returns an iterator handle that is used to iterate over all
+ * the records that represent address locations where functions
+ * are traced.
+ *
+ * May return NULL if no records are available.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_start(void)
+{
+ /*
+ * We only use a single iterator.
+ * Protected by the ftrace_lock mutex.
+ */
+ static struct ftrace_rec_iter ftrace_rec_iter;
+ struct ftrace_rec_iter *iter = &ftrace_rec_iter;
+
+ iter->pg = ftrace_pages_start;
+ iter->index = 0;
+
+ /* Could have empty pages */
+ while (iter->pg && !iter->pg->index)
+ iter->pg = iter->pg->next;
+
+ if (!iter->pg)
+ return NULL;
+
+ return iter;
+}
+
+/**
+ * ftrace_rec_iter_next, get the next record to process.
+ * @iter: The handle to the iterator.
+ *
+ * Returns the next iterator after the given iterator @iter.
+ */
+struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
+{
+ iter->index++;
+
+ if (iter->index >= iter->pg->index) {
+ iter->pg = iter->pg->next;
+ iter->index = 0;
+
+ /* Could have empty pages */
+ while (iter->pg && !iter->pg->index)
+ iter->pg = iter->pg->next;
+ }
+
+ if (!iter->pg)
+ return NULL;
+
+ return iter;
+}
+
+/**
+ * ftrace_rec_iter_record, get the record at the iterator location
+ * @iter: The current iterator location
+ *
+ * Returns the record that the current @iter is at.
+ */
+struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
+{
+ return &iter->pg->records[iter->index];
+}
+
static int
ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
{
@@ -1616,13 +1756,7 @@ static int __ftrace_modify_code(void *data)
{
int *command = data;
- /*
- * Do not call function tracer while we update the code.
- * We are in stop machine, no worrying about races.
- */
- function_trace_stop++;
-
- if (*command & FTRACE_ENABLE_CALLS)
+ if (*command & FTRACE_UPDATE_CALLS)
ftrace_replace_code(1);
else if (*command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
@@ -1635,21 +1769,33 @@ static int __ftrace_modify_code(void *data)
else if (*command & FTRACE_STOP_FUNC_RET)
ftrace_disable_ftrace_graph_caller();
-#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- /*
- * For archs that call ftrace_test_stop_func(), we must
- * wait till after we update all the function callers
- * before we update the callback. This keeps different
- * ops that record different functions from corrupting
- * each other.
- */
- __ftrace_trace_function = __ftrace_trace_function_delay;
-#endif
- function_trace_stop--;
-
return 0;
}
+/**
+ * ftrace_run_stop_machine, go back to the stop machine method
+ * @command: The command to tell ftrace what to do
+ *
+ * If an arch needs to fall back to the stop machine method, the
+ * it can call this function.
+ */
+void ftrace_run_stop_machine(int command)
+{
+ stop_machine(__ftrace_modify_code, &command, NULL);
+}
+
+/**
+ * arch_ftrace_update_code, modify the code to trace or not trace
+ * @command: The command that needs to be done
+ *
+ * Archs can override this function if it does not need to
+ * run stop_machine() to modify code.
+ */
+void __weak arch_ftrace_update_code(int command)
+{
+ ftrace_run_stop_machine(command);
+}
+
static void ftrace_run_update_code(int command)
{
int ret;
@@ -1658,8 +1804,31 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
if (ret)
return;
+ /*
+ * Do not call function tracer while we update the code.
+ * We are in stop machine.
+ */
+ function_trace_stop++;
- stop_machine(__ftrace_modify_code, &command, NULL);
+ /*
+ * By default we use stop_machine() to modify the code.
+ * But archs can do what ever they want as long as it
+ * is safe. The stop_machine() is the safest, but also
+ * produces the most overhead.
+ */
+ arch_ftrace_update_code(command);
+
+#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
+ /*
+ * For archs that call ftrace_test_stop_func(), we must
+ * wait till after we update all the function callers
+ * before we update the callback. This keeps different
+ * ops that record different functions from corrupting
+ * each other.
+ */
+ __ftrace_trace_function = __ftrace_trace_function_delay;
+#endif
+ function_trace_stop--;
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
@@ -1690,7 +1859,7 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return -ENODEV;
ftrace_start_up++;
- command |= FTRACE_ENABLE_CALLS;
+ command |= FTRACE_UPDATE_CALLS;
/* ops marked global share the filter hashes */
if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
@@ -1742,8 +1911,7 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops != &global_ops || !global_start_up)
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
- if (!ftrace_start_up)
- command |= FTRACE_DISABLE_CALLS;
+ command |= FTRACE_UPDATE_CALLS;
if (saved_ftrace_func != ftrace_trace_function) {
saved_ftrace_func = ftrace_trace_function;
@@ -1765,7 +1933,7 @@ static void ftrace_startup_sysctl(void)
saved_ftrace_func = NULL;
/* ftrace_start_up is true if we want ftrace running */
if (ftrace_start_up)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
}
static void ftrace_shutdown_sysctl(void)
@@ -1787,14 +1955,16 @@ static int ops_traces_mod(struct ftrace_ops *ops)
struct ftrace_hash *hash;
hash = ops->filter_hash;
- return !!(!hash || !hash->count);
+ return ftrace_hash_empty(hash);
}
static int ftrace_update_code(struct module *mod)
{
+ struct ftrace_page *pg;
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long ref = 0;
+ int i;
/*
* When adding a module, we need to check if tracers are
@@ -1816,46 +1986,44 @@ static int ftrace_update_code(struct module *mod)
start = ftrace_now(raw_smp_processor_id());
ftrace_update_cnt = 0;
- while (ftrace_new_addrs) {
+ for (pg = ftrace_new_pgs; pg; pg = pg->next) {
- /* If something went wrong, bail without enabling anything */
- if (unlikely(ftrace_disabled))
- return -1;
+ for (i = 0; i < pg->index; i++) {
+ /* If something went wrong, bail without enabling anything */
+ if (unlikely(ftrace_disabled))
+ return -1;
- p = ftrace_new_addrs;
- ftrace_new_addrs = p->newlist;
- p->flags = ref;
+ p = &pg->records[i];
+ p->flags = ref;
- /*
- * Do the initial record conversion from mcount jump
- * to the NOP instructions.
- */
- if (!ftrace_code_disable(mod, p)) {
- ftrace_free_rec(p);
- /* Game over */
- break;
- }
+ /*
+ * Do the initial record conversion from mcount jump
+ * to the NOP instructions.
+ */
+ if (!ftrace_code_disable(mod, p))
+ break;
- ftrace_update_cnt++;
+ ftrace_update_cnt++;
- /*
- * If the tracing is enabled, go ahead and enable the record.
- *
- * The reason not to enable the record immediatelly is the
- * inherent check of ftrace_make_nop/ftrace_make_call for
- * correct previous instructions. Making first the NOP
- * conversion puts the module to the correct state, thus
- * passing the ftrace_make_call check.
- */
- if (ftrace_start_up && ref) {
- int failed = __ftrace_replace_code(p, 1);
- if (failed) {
- ftrace_bug(failed, p->ip);
- ftrace_free_rec(p);
+ /*
+ * If the tracing is enabled, go ahead and enable the record.
+ *
+ * The reason not to enable the record immediatelly is the
+ * inherent check of ftrace_make_nop/ftrace_make_call for
+ * correct previous instructions. Making first the NOP
+ * conversion puts the module to the correct state, thus
+ * passing the ftrace_make_call check.
+ */
+ if (ftrace_start_up && ref) {
+ int failed = __ftrace_replace_code(p, 1);
+ if (failed)
+ ftrace_bug(failed, p->ip);
}
}
}
+ ftrace_new_pgs = NULL;
+
stop = ftrace_now(raw_smp_processor_id());
ftrace_update_time = stop - start;
ftrace_update_tot_cnt += ftrace_update_cnt;
@@ -1863,57 +2031,108 @@ static int ftrace_update_code(struct module *mod)
return 0;
}
-static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+static int ftrace_allocate_records(struct ftrace_page *pg, int count)
{
- struct ftrace_page *pg;
+ int order;
int cnt;
- int i;
- /* allocate a few pages */
- ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
- if (!ftrace_pages_start)
- return -1;
+ if (WARN_ON(!count))
+ return -EINVAL;
+
+ order = get_count_order(DIV_ROUND_UP(count, ENTRIES_PER_PAGE));
/*
- * Allocate a few more pages.
- *
- * TODO: have some parser search vmlinux before
- * final linking to find all calls to ftrace.
- * Then we can:
- * a) know how many pages to allocate.
- * and/or
- * b) set up the table then.
- *
- * The dynamic code is still necessary for
- * modules.
+ * We want to fill as much as possible. No more than a page
+ * may be empty.
*/
+ while ((PAGE_SIZE << order) / ENTRY_SIZE >= count + ENTRIES_PER_PAGE)
+ order--;
- pg = ftrace_pages = ftrace_pages_start;
+ again:
+ pg->records = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
- cnt = num_to_init / ENTRIES_PER_PAGE;
- pr_info("ftrace: allocating %ld entries in %d pages\n",
- num_to_init, cnt + 1);
+ if (!pg->records) {
+ /* if we can't allocate this size, try something smaller */
+ if (!order)
+ return -ENOMEM;
+ order >>= 1;
+ goto again;
+ }
- for (i = 0; i < cnt; i++) {
- pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+ cnt = (PAGE_SIZE << order) / ENTRY_SIZE;
+ pg->size = cnt;
- /* If we fail, we'll try later anyway */
- if (!pg->next)
+ if (cnt > count)
+ cnt = count;
+
+ return cnt;
+}
+
+static struct ftrace_page *
+ftrace_allocate_pages(unsigned long num_to_init)
+{
+ struct ftrace_page *start_pg;
+ struct ftrace_page *pg;
+ int order;
+ int cnt;
+
+ if (!num_to_init)
+ return 0;
+
+ start_pg = pg = kzalloc(sizeof(*pg), GFP_KERNEL);
+ if (!pg)
+ return NULL;
+
+ /*
+ * Try to allocate as much as possible in one continues
+ * location that fills in all of the space. We want to
+ * waste as little space as possible.
+ */
+ for (;;) {
+ cnt = ftrace_allocate_records(pg, num_to_init);
+ if (cnt < 0)
+ goto free_pages;
+
+ num_to_init -= cnt;
+ if (!num_to_init)
break;
+ pg->next = kzalloc(sizeof(*pg), GFP_KERNEL);
+ if (!pg->next)
+ goto free_pages;
+
pg = pg->next;
}
- return 0;
+ return start_pg;
+
+ free_pages:
+ while (start_pg) {
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ start_pg = pg->next;
+ kfree(pg);
+ pg = start_pg;
+ }
+ pr_info("ftrace: FAILED to allocate memory for functions\n");
+ return NULL;
}
-enum {
- FTRACE_ITER_FILTER = (1 << 0),
- FTRACE_ITER_NOTRACE = (1 << 1),
- FTRACE_ITER_PRINTALL = (1 << 2),
- FTRACE_ITER_HASH = (1 << 3),
- FTRACE_ITER_ENABLED = (1 << 4),
-};
+static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
+{
+ int cnt;
+
+ if (!num_to_init) {
+ pr_info("ftrace: No functions to be traced?\n");
+ return -1;
+ }
+
+ cnt = num_to_init / ENTRIES_PER_PAGE;
+ pr_info("ftrace: allocating %ld entries in %d pages\n",
+ num_to_init, cnt + 1);
+
+ return 0;
+}
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1979,6 +2198,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
void *p = NULL;
loff_t l;
+ if (!(iter->flags & FTRACE_ITER_DO_HASH))
+ return NULL;
+
if (iter->func_pos > *pos)
return NULL;
@@ -2022,7 +2244,7 @@ static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct ftrace_ops *ops = &global_ops;
+ struct ftrace_ops *ops = iter->ops;
struct dyn_ftrace *rec = NULL;
if (unlikely(ftrace_disabled))
@@ -2046,9 +2268,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
}
} else {
rec = &iter->pg->records[iter->idx++];
- if ((rec->flags & FTRACE_FL_FREE) ||
-
- ((iter->flags & FTRACE_ITER_FILTER) &&
+ if (((iter->flags & FTRACE_ITER_FILTER) &&
!(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
@@ -2080,7 +2300,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- struct ftrace_ops *ops = &global_ops;
+ struct ftrace_ops *ops = iter->ops;
void *p = NULL;
loff_t l;
@@ -2100,7 +2320,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
+ if (iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->filter_hash)) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2125,12 +2346,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
break;
}
- if (!p) {
- if (iter->flags & FTRACE_ITER_FILTER)
- return t_hash_start(m, pos);
-
- return NULL;
- }
+ if (!p)
+ return t_hash_start(m, pos);
return iter;
}
@@ -2188,6 +2405,7 @@ ftrace_avail_open(struct inode *inode, struct file *file)
return -ENOMEM;
iter->pg = ftrace_pages_start;
+ iter->ops = &global_ops;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -2216,6 +2434,7 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
iter->pg = ftrace_pages_start;
iter->flags = FTRACE_ITER_ENABLED;
+ iter->ops = &global_ops;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
@@ -2236,7 +2455,23 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
mutex_unlock(&ftrace_lock);
}
-static int
+/**
+ * ftrace_regex_open - initialize function tracer filter files
+ * @ops: The ftrace_ops that hold the hash filters
+ * @flag: The type of filter to process
+ * @inode: The inode, usually passed in to your open routine
+ * @file: The file, usually passed in to your open routine
+ *
+ * ftrace_regex_open() initializes the filter files for the
+ * @ops. Depending on @flag it may process the filter hash or
+ * the notrace hash of @ops. With this called from the open
+ * routine, you can use ftrace_filter_write() for the write
+ * routine if @flag has FTRACE_ITER_FILTER set, or
+ * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
+ * ftrace_regex_lseek() should be used as the lseek routine, and
+ * release must call ftrace_regex_release().
+ */
+int
ftrace_regex_open(struct ftrace_ops *ops, int flag,
struct inode *inode, struct file *file)
{
@@ -2305,8 +2540,9 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
static int
ftrace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
- inode, file);
+ return ftrace_regex_open(&global_ops,
+ FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH,
+ inode, file);
}
static int
@@ -2316,7 +2552,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-static loff_t
+loff_t
ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
{
loff_t ret;
@@ -2425,7 +2661,6 @@ match_records(struct ftrace_hash *hash, char *buff,
goto out_unlock;
do_for_each_ftrace_rec(pg, rec) {
-
if (ftrace_match_record(rec, mod, search, search_len, type)) {
ret = enter_record(hash, rec, not);
if (ret < 0) {
@@ -2870,14 +3105,14 @@ out_unlock:
return ret;
}
-static ssize_t
+ssize_t
ftrace_filter_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
}
-static ssize_t
+ssize_t
ftrace_notrace_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -2918,7 +3153,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED
&& ftrace_enabled)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
mutex_unlock(&ftrace_lock);
@@ -3044,8 +3279,8 @@ static void __init set_ftrace_early_graph(char *buf)
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-static void __init
-set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
+void __init
+ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
{
char *func;
@@ -3058,17 +3293,16 @@ set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
static void __init set_ftrace_early_filters(void)
{
if (ftrace_filter_buf[0])
- set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
+ ftrace_set_early_filter(&global_ops, ftrace_filter_buf, 1);
if (ftrace_notrace_buf[0])
- set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
+ ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
set_ftrace_early_graph(ftrace_graph_buf);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
-static int
-ftrace_regex_release(struct inode *inode, struct file *file)
+int ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
@@ -3106,7 +3340,7 @@ ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash, iter->hash);
if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED)
&& ftrace_enabled)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ ftrace_run_update_code(FTRACE_UPDATE_CALLS);
mutex_unlock(&ftrace_lock);
}
@@ -3269,9 +3503,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_FREE)
- continue;
-
if (ftrace_match_record(rec, NULL, search, search_len, type)) {
/* if it is in the array */
exists = false;
@@ -3380,15 +3611,62 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
return 0;
}
+static void ftrace_swap_recs(void *a, void *b, int size)
+{
+ struct dyn_ftrace *reca = a;
+ struct dyn_ftrace *recb = b;
+ struct dyn_ftrace t;
+
+ t = *reca;
+ *reca = *recb;
+ *recb = t;
+}
+
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
{
+ struct ftrace_page *pg;
+ unsigned long count;
unsigned long *p;
unsigned long addr;
unsigned long flags = 0; /* Shut up gcc */
+ int ret = -ENOMEM;
+
+ count = end - start;
+
+ if (!count)
+ return 0;
+
+ pg = ftrace_allocate_pages(count);
+ if (!pg)
+ return -ENOMEM;
mutex_lock(&ftrace_lock);
+
+ /*
+ * Core and each module needs their own pages, as
+ * modules will free them when they are removed.
+ * Force a new page to be allocated for modules.
+ */
+ if (!mod) {
+ WARN_ON(ftrace_pages || ftrace_pages_start);
+ /* First initialization */
+ ftrace_pages = ftrace_pages_start = pg;
+ } else {
+ if (!ftrace_pages)
+ goto out;
+
+ if (WARN_ON(ftrace_pages->next)) {
+ /* Hmm, we have free pages? */
+ while (ftrace_pages->next)
+ ftrace_pages = ftrace_pages->next;
+ }
+
+ ftrace_pages->next = pg;
+ ftrace_pages = pg;
+ }
+
p = start;
while (p < end) {
addr = ftrace_call_adjust(*p++);
@@ -3400,9 +3678,18 @@ static int ftrace_process_locs(struct module *mod,
*/
if (!addr)
continue;
- ftrace_record_ip(addr);
+ if (!ftrace_record_ip(addr))
+ break;
}
+ /* These new locations need to be initialized */
+ ftrace_new_pgs = pg;
+
+ /* Make each individual set of pages sorted by ips */
+ for (; pg; pg = pg->next)
+ sort(pg->records, pg->index, sizeof(struct dyn_ftrace),
+ ftrace_cmp_recs, ftrace_swap_recs);
+
/*
* We only need to disable interrupts on start up
* because we are modifying code that an interrupt
@@ -3416,32 +3703,55 @@ static int ftrace_process_locs(struct module *mod,
ftrace_update_code(mod);
if (!mod)
local_irq_restore(flags);
+ ret = 0;
+ out:
mutex_unlock(&ftrace_lock);
- return 0;
+ return ret;
}
#ifdef CONFIG_MODULES
+
+#define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
+
void ftrace_release_mod(struct module *mod)
{
struct dyn_ftrace *rec;
+ struct ftrace_page **last_pg;
struct ftrace_page *pg;
+ int order;
mutex_lock(&ftrace_lock);
if (ftrace_disabled)
goto out_unlock;
- do_for_each_ftrace_rec(pg, rec) {
+ /*
+ * Each module has its own ftrace_pages, remove
+ * them from the list.
+ */
+ last_pg = &ftrace_pages_start;
+ for (pg = ftrace_pages_start; pg; pg = *last_pg) {
+ rec = &pg->records[0];
if (within_module_core(rec->ip, mod)) {
/*
- * rec->ip is changed in ftrace_free_rec()
- * It should not between s and e if record was freed.
+ * As core pages are first, the first
+ * page should never be a module page.
*/
- FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
- ftrace_free_rec(rec);
- }
- } while_for_each_ftrace_rec();
+ if (WARN_ON(pg == ftrace_pages_start))
+ goto out_unlock;
+
+ /* Check if we are deleting the last page */
+ if (pg == ftrace_pages)
+ ftrace_pages = next_to_ftrace_page(last_pg);
+
+ *last_pg = pg->next;
+ order = get_count_order(pg->size / ENTRIES_PER_PAGE);
+ free_pages((unsigned long)pg->records, order);
+ kfree(pg);
+ } else
+ last_pg = &pg->next;
+ }
out_unlock:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60..a3f1bc5d2a0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
+ TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
+ TRACE_ITER_IRQ_INFO;
static int trace_stop_count;
static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
"record-cmd",
"overwrite",
"disable_on_free",
+ "irq-info",
NULL
};
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
trace_event_read_unlock();
}
+static void
+get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
+{
+ unsigned long count;
+ int cpu;
+
+ *total = 0;
+ *entries = 0;
+
+ for_each_tracing_cpu(cpu) {
+ count = ring_buffer_entries_cpu(tr->buffer, cpu);
+ /*
+ * If this buffer has skipped entries, then we hold all
+ * entries for the trace and we need to ignore the
+ * ones before the time stamp.
+ */
+ if (tr->data[cpu]->skipped_entries) {
+ count -= tr->data[cpu]->skipped_entries;
+ /* total is the same as the entries */
+ *total += count;
+ } else
+ *total += count +
+ ring_buffer_overrun_cpu(tr->buffer, cpu);
+ *entries += count;
+ }
+}
+
static void print_lat_help_header(struct seq_file *m)
{
seq_puts(m, "# _------=> CPU# \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
seq_puts(m, "# \\ / ||||| \\ | / \n");
}
-static void print_func_help_header(struct seq_file *m)
+static void print_event_info(struct trace_array *tr, struct seq_file *m)
+{
+ unsigned long total;
+ unsigned long entries;
+
+ get_total_entries(tr, &total, &entries);
+ seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
+ entries, total, num_online_cpus());
+ seq_puts(m, "#\n");
+}
+
+static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
{
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
+ print_event_info(tr, m);
+ seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
seq_puts(m, "# | | | | |\n");
}
+static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
+{
+ print_event_info(tr, m);
+ seq_puts(m, "# _-----=> irqs-off\n");
+ seq_puts(m, "# / _----=> need-resched\n");
+ seq_puts(m, "# | / _---=> hardirq/softirq\n");
+ seq_puts(m, "# || / _--=> preempt-depth\n");
+ seq_puts(m, "# ||| / delay\n");
+ seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | |||| | |\n");
+}
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
struct trace_array *tr = iter->tr;
struct trace_array_cpu *data = tr->data[tr->cpu];
struct tracer *type = current_trace;
- unsigned long entries = 0;
- unsigned long total = 0;
- unsigned long count;
+ unsigned long entries;
+ unsigned long total;
const char *name = "preemption";
- int cpu;
if (type)
name = type->name;
-
- for_each_tracing_cpu(cpu) {
- count = ring_buffer_entries_cpu(tr->buffer, cpu);
- /*
- * If this buffer has skipped entries, then we hold all
- * entries for the trace and we need to ignore the
- * ones before the time stamp.
- */
- if (tr->data[cpu]->skipped_entries) {
- count -= tr->data[cpu]->skipped_entries;
- /* total is the same as the entries */
- total += count;
- } else
- total += count +
- ring_buffer_overrun_cpu(tr->buffer, cpu);
- entries += count;
- }
+ get_total_entries(tr, &total, &entries);
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
return print_trace_fmt(iter);
}
+void trace_latency_header(struct seq_file *m)
+{
+ struct trace_iterator *iter = m->private;
+
+ /* print nothing if the buffers are empty */
+ if (trace_empty(iter))
+ return;
+
+ if (iter->iter_flags & TRACE_FILE_LAT_FMT)
+ print_trace_header(m, iter);
+
+ if (!(trace_flags & TRACE_ITER_VERBOSE))
+ print_lat_help_header(m);
+}
+
void trace_default_header(struct seq_file *m)
{
struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
if (!(trace_flags & TRACE_ITER_VERBOSE))
print_lat_help_header(m);
} else {
- if (!(trace_flags & TRACE_ITER_VERBOSE))
- print_func_help_header(m);
+ if (!(trace_flags & TRACE_ITER_VERBOSE)) {
+ if (trace_flags & TRACE_ITER_IRQ_INFO)
+ print_func_help_header_irq(iter->tr, m);
+ else
+ print_func_help_header(iter->tr, m);
+ }
}
}
@@ -4385,7 +4438,7 @@ static const struct file_operations trace_options_core_fops = {
};
struct dentry *trace_create_file(const char *name,
- mode_t mode,
+ umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops)
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
__ftrace_dump(true, oops_dump_mode);
}
+EXPORT_SYMBOL_GPL(ftrace_dump);
__init static int tracer_alloc_buffers(void)
{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18d..b93ecbadad6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -312,7 +312,7 @@ void tracing_reset_current(int cpu);
void tracing_reset_current_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *trace_create_file(const char *name,
- mode_t mode,
+ umode_t mode,
struct dentry *parent,
void *data,
const struct file_operations *fops);
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
+void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
TRACE_ITER_RECORD_CMD = 0x100000,
TRACE_ITER_OVERWRITE = 0x200000,
TRACE_ITER_STOP_ON_FREE = 0x400000,
+ TRACE_ITER_IRQ_INFO = 0x800000,
};
/*
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 581876f9f38..c212a7f934e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1078,7 +1078,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
if (strcmp(system->name, name) == 0) {
- __get_system(system);
system->nr_events++;
return system->entry;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 816d3d07497..24aee712745 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
#include "trace.h"
#include "trace_output.h"
+#define DEFAULT_SYS_FILTER_MESSAGE \
+ "### global filter ###\n" \
+ "# Use this to set filters for multiple events.\n" \
+ "# Only events with the given fields will be affected.\n" \
+ "# If no events are modified, an error message will be displayed here"
+
enum filter_op_ids
{
OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
- trace_seq_printf(s, "none\n");
+ trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
mutex_unlock(&event_mutex);
}
@@ -1649,7 +1655,9 @@ static int replace_system_preds(struct event_subsystem *system,
*/
err = replace_preds(call, NULL, ps, filter_string, true);
if (err)
- goto fail;
+ call->flags |= TRACE_EVENT_FL_NO_SET_FILTER;
+ else
+ call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER;
}
list_for_each_entry(call, &ftrace_events, list) {
@@ -1658,6 +1666,9 @@ static int replace_system_preds(struct event_subsystem *system,
if (strcmp(call->class->system, system->name) != 0)
continue;
+ if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER)
+ continue;
+
filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
if (!filter_item)
goto fail_mem;
@@ -1686,7 +1697,7 @@ static int replace_system_preds(struct event_subsystem *system,
* replace the filter for the call.
*/
filter = call->filter;
- call->filter = filter_item->filter;
+ rcu_assign_pointer(call->filter, filter_item->filter);
filter_item->filter = filter;
fail = false;
@@ -1727,11 +1738,121 @@ static int replace_system_preds(struct event_subsystem *system,
return -ENOMEM;
}
+static int create_filter_start(char *filter_str, bool set_str,
+ struct filter_parse_state **psp,
+ struct event_filter **filterp)
+{
+ struct event_filter *filter;
+ struct filter_parse_state *ps = NULL;
+ int err = 0;
+
+ WARN_ON_ONCE(*psp || *filterp);
+
+ /* allocate everything, and if any fails, free all and fail */
+ filter = __alloc_filter();
+ if (filter && set_str)
+ err = replace_filter_string(filter, filter_str);
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+
+ if (!filter || !ps || err) {
+ kfree(ps);
+ __free_filter(filter);
+ return -ENOMEM;
+ }
+
+ /* we're committed to creating a new filter */
+ *filterp = filter;
+ *psp = ps;
+
+ parse_init(ps, filter_ops, filter_str);
+ err = filter_parse(ps);
+ if (err && set_str)
+ append_filter_err(ps, filter);
+ return err;
+}
+
+static void create_filter_finish(struct filter_parse_state *ps)
+{
+ if (ps) {
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+ }
+}
+
+/**
+ * create_filter - create a filter for a ftrace_event_call
+ * @call: ftrace_event_call to create a filter for
+ * @filter_str: filter string
+ * @set_str: remember @filter_str and enable detailed error in filter
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Creates a filter for @call with @filter_str. If @set_str is %true,
+ * @filter_str is copied and recorded in the new filter.
+ *
+ * On success, returns 0 and *@filterp points to the new filter. On
+ * failure, returns -errno and *@filterp may point to %NULL or to a new
+ * filter. In the latter case, the returned filter contains error
+ * information if @set_str is %true and the caller is responsible for
+ * freeing it.
+ */
+static int create_filter(struct ftrace_event_call *call,
+ char *filter_str, bool set_str,
+ struct event_filter **filterp)
+{
+ struct event_filter *filter = NULL;
+ struct filter_parse_state *ps = NULL;
+ int err;
+
+ err = create_filter_start(filter_str, set_str, &ps, &filter);
+ if (!err) {
+ err = replace_preds(call, filter, ps, filter_str, false);
+ if (err && set_str)
+ append_filter_err(ps, filter);
+ }
+ create_filter_finish(ps);
+
+ *filterp = filter;
+ return err;
+}
+
+/**
+ * create_system_filter - create a filter for an event_subsystem
+ * @system: event_subsystem to create a filter for
+ * @filter_str: filter string
+ * @filterp: out param for created filter (always updated on return)
+ *
+ * Identical to create_filter() except that it creates a subsystem filter
+ * and always remembers @filter_str.
+ */
+static int create_system_filter(struct event_subsystem *system,
+ char *filter_str, struct event_filter **filterp)
+{
+ struct event_filter *filter = NULL;
+ struct filter_parse_state *ps = NULL;
+ int err;
+
+ err = create_filter_start(filter_str, true, &ps, &filter);
+ if (!err) {
+ err = replace_system_preds(system, ps, filter_str);
+ if (!err) {
+ /* System filters just show a default message */
+ kfree(filter->filter_string);
+ filter->filter_string = NULL;
+ } else {
+ append_filter_err(ps, filter);
+ }
+ }
+ create_filter_finish(ps);
+
+ *filterp = filter;
+ return err;
+}
+
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
- struct filter_parse_state *ps;
struct event_filter *filter;
- struct event_filter *tmp;
int err = 0;
mutex_lock(&event_mutex);
@@ -1741,56 +1862,37 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
filter = call->filter;
if (!filter)
goto out_unlock;
- call->filter = NULL;
+ RCU_INIT_POINTER(call->filter, NULL);
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
goto out_unlock;
}
- err = -ENOMEM;
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto out_unlock;
-
- filter = __alloc_filter();
- if (!filter) {
- kfree(ps);
- goto out_unlock;
- }
+ err = create_filter(call, filter_string, true, &filter);
- replace_filter_string(filter, filter_string);
-
- parse_init(ps, filter_ops, filter_string);
- err = filter_parse(ps);
- if (err) {
- append_filter_err(ps, filter);
- goto out;
- }
-
- err = replace_preds(call, filter, ps, filter_string, false);
- if (err) {
- filter_disable(call);
- append_filter_err(ps, filter);
- } else
- call->flags |= TRACE_EVENT_FL_FILTERED;
-out:
/*
* Always swap the call filter with the new filter
* even if there was an error. If there was an error
* in the filter, we disable the filter and show the error
* string
*/
- tmp = call->filter;
- call->filter = filter;
- if (tmp) {
- /* Make sure the call is done with the filter */
- synchronize_sched();
- __free_filter(tmp);
+ if (filter) {
+ struct event_filter *tmp = call->filter;
+
+ if (!err)
+ call->flags |= TRACE_EVENT_FL_FILTERED;
+ else
+ filter_disable(call);
+
+ rcu_assign_pointer(call->filter, filter);
+
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ __free_filter(tmp);
+ }
}
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
out_unlock:
mutex_unlock(&event_mutex);
@@ -1800,7 +1902,6 @@ out_unlock:
int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
- struct filter_parse_state *ps;
struct event_filter *filter;
int err = 0;
@@ -1824,38 +1925,15 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
goto out_unlock;
}
- err = -ENOMEM;
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto out_unlock;
-
- filter = __alloc_filter();
- if (!filter)
- goto out;
-
- replace_filter_string(filter, filter_string);
- /*
- * No event actually uses the system filter
- * we can free it without synchronize_sched().
- */
- __free_filter(system->filter);
- system->filter = filter;
-
- parse_init(ps, filter_ops, filter_string);
- err = filter_parse(ps);
- if (err) {
- append_filter_err(ps, system->filter);
- goto out;
+ err = create_system_filter(system, filter_string, &filter);
+ if (filter) {
+ /*
+ * No event actually uses the system filter
+ * we can free it without synchronize_sched().
+ */
+ __free_filter(system->filter);
+ system->filter = filter;
}
-
- err = replace_system_preds(system, ps, filter_string);
- if (err)
- append_filter_err(ps, system->filter);
-
-out:
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
out_unlock:
mutex_unlock(&event_mutex);
@@ -1877,7 +1955,6 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
{
int err;
struct event_filter *filter;
- struct filter_parse_state *ps;
struct ftrace_event_call *call;
mutex_lock(&event_mutex);
@@ -1892,33 +1969,10 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
if (event->filter)
goto out_unlock;
- filter = __alloc_filter();
- if (!filter) {
- err = PTR_ERR(filter);
- goto out_unlock;
- }
-
- err = -ENOMEM;
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto free_filter;
-
- parse_init(ps, filter_ops, filter_str);
- err = filter_parse(ps);
- if (err)
- goto free_ps;
-
- err = replace_preds(call, filter, ps, filter_str, false);
+ err = create_filter(call, filter_str, false, &filter);
if (!err)
event->filter = filter;
-
-free_ps:
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
-
-free_filter:
- if (err)
+ else
__free_filter(filter);
out_unlock:
@@ -1937,43 +1991,6 @@ out_unlock:
#define CREATE_TRACE_POINTS
#include "trace_events_filter_test.h"
-static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
- struct event_filter **pfilter)
-{
- struct event_filter *filter;
- struct filter_parse_state *ps;
- int err = -ENOMEM;
-
- filter = __alloc_filter();
- if (!filter)
- goto out;
-
- ps = kzalloc(sizeof(*ps), GFP_KERNEL);
- if (!ps)
- goto free_filter;
-
- parse_init(ps, filter_ops, filter_str);
- err = filter_parse(ps);
- if (err)
- goto free_ps;
-
- err = replace_preds(call, filter, ps, filter_str, false);
- if (!err)
- *pfilter = filter;
-
- free_ps:
- filter_opstack_clear(ps);
- postfix_clear(ps);
- kfree(ps);
-
- free_filter:
- if (err)
- __free_filter(filter);
-
- out:
- return err;
-}
-
#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
{ \
.filter = FILTER, \
@@ -2092,12 +2109,13 @@ static __init int ftrace_test_event_filter(void)
struct test_filter_data_t *d = &test_filter_data[i];
int err;
- err = test_get_filter(d->filter, &event_ftrace_test_filter,
- &filter);
+ err = create_filter(&event_ftrace_test_filter, d->filter,
+ false, &filter);
if (err) {
printk(KERN_INFO
"Failed to get filter for '%s', err %d\n",
d->filter, err);
+ __free_filter(filter);
break;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 20dad0d7a16..99d20e92036 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
}
static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
-static void irqsoff_print_header(struct seq_file *s) { }
static void irqsoff_trace_open(struct trace_iterator *iter) { }
static void irqsoff_trace_close(struct trace_iterator *iter) { }
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void irqsoff_print_header(struct seq_file *s)
+{
+ trace_default_header(s);
+}
+#else
+static void irqsoff_print_header(struct seq_file *s)
+{
+ trace_latency_header(s);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
/*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6c..0d6ff355594 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned long secs = (unsigned long)t;
char comm[TASK_COMM_LEN];
+ int ret;
trace_find_cmdline(entry->pid, comm);
- return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
- comm, entry->pid, iter->cpu, secs, usec_rem);
+ ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
+ comm, entry->pid, iter->cpu);
+ if (!ret)
+ return 0;
+
+ if (trace_flags & TRACE_ITER_IRQ_INFO) {
+ ret = trace_print_lat_fmt(s, entry);
+ if (!ret)
+ return 0;
+ }
+
+ return trace_seq_printf(s, " %5lu.%06lu: ",
+ secs, usec_rem);
}
int trace_print_lat_context(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b..ff791ea48b5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
}
static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
-static void wakeup_print_header(struct seq_file *s) { }
static void wakeup_trace_open(struct trace_iterator *iter) { }
static void wakeup_trace_close(struct trace_iterator *iter) { }
+
+#ifdef CONFIG_FUNCTION_TRACER
+static void wakeup_print_header(struct seq_file *s)
+{
+ trace_default_header(s);
+}
+#else
+static void wakeup_print_header(struct seq_file *s)
+{
+ trace_latency_header(s);
+}
+#endif /* CONFIG_FUNCTION_TRACER */
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
/*
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 77575b386d9..d4545f49242 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,6 +13,9 @@
#include <linux/sysctl.h>
#include <linux/init.h>
#include <linux/fs.h>
+
+#include <asm/setup.h>
+
#include "trace.h"
#define STACK_TRACE_ENTRIES 500
@@ -133,7 +136,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
- .flags = FTRACE_OPS_FL_GLOBAL,
};
static ssize_t
@@ -311,6 +313,21 @@ static const struct file_operations stack_trace_fops = {
.release = seq_release,
};
+static int
+stack_trace_filter_open(struct inode *inode, struct file *file)
+{
+ return ftrace_regex_open(&trace_ops, FTRACE_ITER_FILTER,
+ inode, file);
+}
+
+static const struct file_operations stack_trace_filter_fops = {
+ .open = stack_trace_filter_open,
+ .read = seq_read,
+ .write = ftrace_filter_write,
+ .llseek = ftrace_regex_lseek,
+ .release = ftrace_regex_release,
+};
+
int
stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -338,8 +355,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
return ret;
}
+static char stack_trace_filter_buf[COMMAND_LINE_SIZE+1] __initdata;
+
static __init int enable_stacktrace(char *str)
{
+ if (strncmp(str, "_filter=", 8) == 0)
+ strncpy(stack_trace_filter_buf, str+8, COMMAND_LINE_SIZE);
+
stack_tracer_enabled = 1;
last_stack_tracer_enabled = 1;
return 1;
@@ -358,6 +380,12 @@ static __init int stack_trace_init(void)
trace_create_file("stack_trace", 0444, d_tracer,
NULL, &stack_trace_fops);
+ trace_create_file("stack_trace_filter", 0444, d_tracer,
+ NULL, &stack_trace_filter_fops);
+
+ if (stack_trace_filter_buf[0])
+ ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
+
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index db110b8ae03..f1539decd99 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -634,10 +634,11 @@ static int tracepoint_module_coming(struct module *mod)
int ret = 0;
/*
- * We skip modules that tain the kernel, especially those with different
- * module header (for forced load), to make sure we don't cause a crash.
+ * We skip modules that taint the kernel, especially those with different
+ * module headers (for forced load), to make sure we don't cause a crash.
+ * Staging and out-of-tree GPL modules are fine.
*/
- if (mod->taints)
+ if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
return 0;
mutex_lock(&tracepoints_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866..23b4d784ebd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
local_irq_save(flags);
time = tsk->stime + tsk->utime;
- dtime = cputime_sub(time, tsk->acct_timexpd);
+ dtime = time - tsk->acct_timexpd;
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/wait.c b/kernel/wait.c
index 26fa7797f90..7fdd9eaca2c 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,10 +10,10 @@
#include <linux/wait.h>
#include <linux/hash.h>
-void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key)
+void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
spin_lock_init(&q->lock);
- lockdep_set_class(&q->lock, key);
+ lockdep_set_class_and_name(&q->lock, key, name);
INIT_LIST_HEAD(&q->task_list);
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 1d7bca7f4f5..d117262deba 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -296,7 +296,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
- printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
print_modules();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42fa9ad0a81..bec7b5b53e0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -242,10 +242,10 @@ struct workqueue_struct {
int nr_drainers; /* W: drain in progress */
int saved_max_active; /* W: saved cwq max_active */
- const char *name; /* I: workqueue name */
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
+ char name[]; /* I: workqueue name */
};
struct workqueue_struct *system_wq __read_mostly;
@@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags,
return clamp_val(max_active, 1, lim);
}
-struct workqueue_struct *__alloc_workqueue_key(const char *name,
+struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
unsigned int flags,
int max_active,
struct lock_class_key *key,
- const char *lock_name)
+ const char *lock_name, ...)
{
+ va_list args, args1;
struct workqueue_struct *wq;
unsigned int cpu;
+ size_t namelen;
+
+ /* determine namelen, allocate wq and format name */
+ va_start(args, lock_name);
+ va_copy(args1, args);
+ namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+
+ wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+ if (!wq)
+ goto err;
+
+ vsnprintf(wq->name, namelen, fmt, args1);
+ va_end(args);
+ va_end(args1);
/*
* Workqueues which may be used during memory reclaim should
@@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
flags |= WQ_HIGHPRI;
max_active = max_active ?: WQ_DFL_ACTIVE;
- max_active = wq_clamp_max_active(max_active, flags, name);
-
- wq = kzalloc(sizeof(*wq), GFP_KERNEL);
- if (!wq)
- goto err;
+ max_active = wq_clamp_max_active(max_active, flags, wq->name);
+ /* init wq */
wq->flags = flags;
wq->saved_max_active = max_active;
mutex_init(&wq->flush_mutex);
@@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
- wq->name = name;
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
@@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
if (!rescuer)
goto err;
- rescuer->task = kthread_create(rescuer_thread, wq, "%s", name);
+ rescuer->task = kthread_create(rescuer_thread, wq, "%s",
+ wq->name);
if (IS_ERR(rescuer->task))
goto err;