summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c24
-rw-r--r--kernel/auditsc.c24
-rw-r--r--kernel/cpu.c32
-rw-r--r--kernel/cpuset.c67
-rw-r--r--kernel/delayacct.c6
-rw-r--r--kernel/die_notifier.c38
-rw-r--r--kernel/exit.c20
-rw-r--r--kernel/fork.c93
-rw-r--r--kernel/futex.c108
-rw-r--r--kernel/hrtimer.c34
-rw-r--r--kernel/irq/chip.c3
-rw-r--r--kernel/irq/devres.c2
-rw-r--r--kernel/irq/handle.c4
-rw-r--r--kernel/irq/manage.c10
-rw-r--r--kernel/irq/proc.c15
-rw-r--r--kernel/irq/spurious.c4
-rw-r--r--kernel/itimer.c60
-rw-r--r--kernel/kallsyms.c104
-rw-r--r--kernel/kexec.c4
-rw-r--r--kernel/kmod.c7
-rw-r--r--kernel/kprobes.c293
-rw-r--r--kernel/ksysfs.c12
-rw-r--r--kernel/lockdep.c59
-rw-r--r--kernel/module.c125
-rw-r--r--kernel/nsproxy.c139
-rw-r--r--kernel/params.c8
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/posix-cpu-timers.c14
-rw-r--r--kernel/posix-timers.c1
-rw-r--r--kernel/power/Kconfig11
-rw-r--r--kernel/power/console.c10
-rw-r--r--kernel/power/disk.c171
-rw-r--r--kernel/power/main.c70
-rw-r--r--kernel/power/power.h49
-rw-r--r--kernel/power/process.c12
-rw-r--r--kernel/power/snapshot.c310
-rw-r--r--kernel/power/swap.c61
-rw-r--r--kernel/power/swsusp.c141
-rw-r--r--kernel/power/user.c48
-rw-r--r--kernel/printk.c27
-rw-r--r--kernel/rcutorture.c45
-rw-r--r--kernel/resource.c21
-rw-r--r--kernel/rwsem.c2
-rw-r--r--kernel/sched.c411
-rw-r--r--kernel/signal.c7
-rw-r--r--kernel/softlockup.c48
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--kernel/sys.c21
-rw-r--r--kernel/sysctl.c13
-rw-r--r--kernel/taskstats.c8
-rw-r--r--kernel/time.c65
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clockevents.c69
-rw-r--r--kernel/time/clocksource.c3
-rw-r--r--kernel/time/jiffies.c2
-rw-r--r--kernel/time/ntp.c30
-rw-r--r--kernel/time/tick-broadcast.c27
-rw-r--r--kernel/time/tick-common.c21
-rw-r--r--kernel/time/tick-internal.h12
-rw-r--r--kernel/time/tick-oneshot.c12
-rw-r--r--kernel/time/tick-sched.c51
-rw-r--r--kernel/time/timekeeping.c476
-rw-r--r--kernel/time/timer_list.c21
-rw-r--r--kernel/time/timer_stats.c14
-rw-r--r--kernel/timer.c549
-rw-r--r--kernel/uid16.c1
-rw-r--r--kernel/utsname.c41
68 files changed, 2548 insertions, 1669 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac6b27abb1a..642d4277c2e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
diff --git a/kernel/audit.c b/kernel/audit.c
index 76c9a11b72d..d13276d4141 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -151,7 +151,7 @@ struct audit_buffer {
static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
{
- struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
nlh->nlmsg_pid = pid;
}
@@ -515,8 +515,8 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
err = -EPERM;
break;
case AUDIT_USER:
- case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
err = -EPERM;
break;
@@ -614,8 +614,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid, sid);
break;
case AUDIT_USER:
- case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
- case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
+ case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+ case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
@@ -750,7 +750,7 @@ static void audit_receive_skb(struct sk_buff *skb)
u32 rlen;
while (skb->len >= NLMSG_SPACE(0)) {
- nlh = (struct nlmsghdr *)skb->data;
+ nlh = nlmsg_hdr(skb);
if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
return;
rlen = NLMSG_ALIGN(nlh->nlmsg_len);
@@ -795,7 +795,7 @@ static int __init audit_init(void)
printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
audit_default ? "enabled" : "disabled");
audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
- THIS_MODULE);
+ NULL, THIS_MODULE);
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
else
@@ -1073,7 +1073,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
goto out;
}
va_copy(args2, args);
- len = vsnprintf(skb->tail, avail, fmt, args);
+ len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
if (len >= avail) {
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
@@ -1082,7 +1082,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
if (!avail)
goto out;
- len = vsnprintf(skb->tail, avail, fmt, args2);
+ len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
}
if (len > 0)
skb_put(skb, len);
@@ -1143,7 +1143,7 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
return;
}
- ptr = skb->tail;
+ ptr = skb_tail_pointer(skb);
for (i=0; i<len; i++) {
*ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
*ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
@@ -1175,7 +1175,7 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
if (!avail)
return;
}
- ptr = skb->tail;
+ ptr = skb_tail_pointer(skb);
*ptr++ = '"';
memcpy(ptr, string, slen);
ptr += slen;
@@ -1268,7 +1268,7 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
if (audit_pid) {
- struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+ struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
skb_queue_tail(&audit_skb_queue, ab->skb);
ab->skb = NULL;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 359955800dd..628c7ac590a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -739,28 +739,26 @@ static inline void audit_free_context(struct audit_context *context)
void audit_log_task_context(struct audit_buffer *ab)
{
char *ctx = NULL;
- ssize_t len = 0;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ selinux_get_task_sid(current, &sid);
+ if (!sid)
+ return;
- len = security_getprocattr(current, "current", NULL, 0);
- if (len < 0) {
- if (len != -EINVAL)
+ error = selinux_sid_to_string(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
goto error_path;
return;
}
- ctx = kmalloc(len, GFP_KERNEL);
- if (!ctx)
- goto error_path;
-
- len = security_getprocattr(current, "current", ctx, len);
- if (len < 0 )
- goto error_path;
-
audit_log_format(ab, " subj=%s", ctx);
+ kfree(ctx);
return;
error_path:
- kfree(ctx);
audit_panic("error in audit_log_task_context");
return;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3d4206ada5c..36e70845cfc 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -254,6 +254,12 @@ int __cpuinit cpu_up(unsigned int cpu)
}
#ifdef CONFIG_SUSPEND_SMP
+/* Needed to prevent the microcode driver from requesting firmware in its CPU
+ * hotplug notifier during the suspend/resume.
+ */
+int suspend_cpu_hotplug;
+EXPORT_SYMBOL(suspend_cpu_hotplug);
+
static cpumask_t frozen_cpus;
int disable_nonboot_cpus(void)
@@ -261,16 +267,8 @@ int disable_nonboot_cpus(void)
int cpu, first_cpu, error = 0;
mutex_lock(&cpu_add_remove_lock);
- first_cpu = first_cpu(cpu_present_map);
- if (!cpu_online(first_cpu)) {
- error = _cpu_up(first_cpu);
- if (error) {
- printk(KERN_ERR "Could not bring CPU%d up.\n",
- first_cpu);
- goto out;
- }
- }
-
+ suspend_cpu_hotplug = 1;
+ first_cpu = first_cpu(cpu_online_map);
/* We take down all of the non-boot CPUs in one shot to avoid races
* with the userspace trying to use the CPU hotplug at the same time
*/
@@ -296,7 +294,7 @@ int disable_nonboot_cpus(void)
} else {
printk(KERN_ERR "Non-boot CPUs are not disabled\n");
}
-out:
+ suspend_cpu_hotplug = 0;
mutex_unlock(&cpu_add_remove_lock);
return error;
}
@@ -308,20 +306,22 @@ void enable_nonboot_cpus(void)
/* Allow everyone to use the CPU hotplug again */
mutex_lock(&cpu_add_remove_lock);
cpu_hotplug_disabled = 0;
- mutex_unlock(&cpu_add_remove_lock);
if (cpus_empty(frozen_cpus))
- return;
+ goto out;
+ suspend_cpu_hotplug = 1;
printk("Enabling non-boot CPUs ...\n");
for_each_cpu_mask(cpu, frozen_cpus) {
- error = cpu_up(cpu);
+ error = _cpu_up(cpu);
if (!error) {
printk("CPU%d is up\n", cpu);
continue;
}
- printk(KERN_WARNING "Error taking CPU%d up: %d\n",
- cpu, error);
+ printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
}
cpus_clear(frozen_cpus);
+ suspend_cpu_hotplug = 0;
+out:
+ mutex_unlock(&cpu_add_remove_lock);
}
#endif
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f382b0f775e..88b416dfbc7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -42,7 +42,6 @@
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
@@ -822,11 +821,22 @@ static int update_cpumask(struct cpuset *cs, char *buf)
return -EACCES;
trialcs = *cs;
- retval = cpulist_parse(buf, trialcs.cpus_allowed);
- if (retval < 0)
- return retval;
+
+ /*
+ * We allow a cpuset's cpus_allowed to be empty; if it has attached
+ * tasks, we'll catch it later when we validate the change and return
+ * -ENOSPC.
+ */
+ if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+ cpus_clear(trialcs.cpus_allowed);
+ } else {
+ retval = cpulist_parse(buf, trialcs.cpus_allowed);
+ if (retval < 0)
+ return retval;
+ }
cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
- if (cpus_empty(trialcs.cpus_allowed))
+ /* cpus_allowed cannot be empty for a cpuset with attached tasks. */
+ if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
return -ENOSPC;
retval = validate_change(cs, &trialcs);
if (retval < 0)
@@ -919,16 +929,27 @@ static int update_nodemask(struct cpuset *cs, char *buf)
return -EACCES;
trialcs = *cs;
- retval = nodelist_parse(buf, trialcs.mems_allowed);
- if (retval < 0)
- goto done;
+
+ /*
+ * We allow a cpuset's mems_allowed to be empty; if it has attached
+ * tasks, we'll catch it later when we validate the change and return
+ * -ENOSPC.
+ */
+ if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
+ nodes_clear(trialcs.mems_allowed);
+ } else {
+ retval = nodelist_parse(buf, trialcs.mems_allowed);
+ if (retval < 0)
+ goto done;
+ }
nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
oldmem = cs->mems_allowed;
if (nodes_equal(oldmem, trialcs.mems_allowed)) {
retval = 0; /* Too easy - nothing to do */
goto done;
}
- if (nodes_empty(trialcs.mems_allowed)) {
+ /* mems_allowed cannot be empty for a cpuset with attached tasks. */
+ if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
retval = -ENOSPC;
goto done;
}
@@ -2200,10 +2221,6 @@ void cpuset_fork(struct task_struct *child)
* it is holding that mutex while calling check_for_release(),
* which calls kmalloc(), so can't be called holding callback_mutex().
*
- * We don't need to task_lock() this reference to tsk->cpuset,
- * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it, or task is a failed fork, never visible to attach_task.
- *
* the_top_cpuset_hack:
*
* Set the exiting tasks cpuset to the root cpuset (top_cpuset).
@@ -2242,8 +2259,10 @@ void cpuset_exit(struct task_struct *tsk)
{
struct cpuset *cs;
+ task_lock(current);
cs = tsk->cpuset;
tsk->cpuset = &top_cpuset; /* the_top_cpuset_hack - see above */
+ task_unlock(current);
if (notify_on_release(cs)) {
char *pathbuf = NULL;
@@ -2351,6 +2370,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* z's node is in our tasks mems_allowed, yes. If it's not a
* __GFP_HARDWALL request and this zone's nodes is in the nearest
* mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * If the task has been OOM killed and has access to memory reserves
+ * as specified by the TIF_MEMDIE flag, yes.
* Otherwise, no.
*
* If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
@@ -2368,7 +2389,8 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* calls get to this routine, we should just shut up and say 'yes'.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
- * and do not allow allocations outside the current tasks cpuset.
+ * and do not allow allocations outside the current tasks cpuset
+ * unless the task has been OOM killed as is marked TIF_MEMDIE.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing mem_exclusive ancestor cpuset.
*
@@ -2392,6 +2414,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
+ * TIF_MEMDIE - any node ok
* GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*
@@ -2413,6 +2436,12 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return 0;
@@ -2438,7 +2467,9 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
*
* If we're in interrupt, yes, we can always allocate.
* If __GFP_THISNODE is set, yes, we can always allocate. If zone
- * z's node is in our tasks mems_allowed, yes. Otherwise, no.
+ * z's node is in our tasks mems_allowed, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the
+ * TIF_MEMDIE flag, yes. Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2462,6 +2493,12 @@ int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
+ /*
+ * Allow tasks that have access to memory reserves because they have
+ * been OOM killed to get memory anywhere.
+ */
+ if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ return 1;
return 0;
}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 766d5912b26..c0148ae992c 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -31,11 +31,7 @@ __setup("nodelayacct", delayacct_setup_disable);
void delayacct_init(void)
{
- delayacct_cache = kmem_cache_create("delayacct_cache",
- sizeof(struct task_delay_info),
- 0,
- SLAB_PANIC,
- NULL, NULL);
+ delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
delayacct_tsk_init(&init_task);
}
diff --git a/kernel/die_notifier.c b/kernel/die_notifier.c
new file mode 100644
index 00000000000..0d98827887a
--- /dev/null
+++ b/kernel/die_notifier.c
@@ -0,0 +1,38 @@
+
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/vmalloc.h>
+#include <linux/kdebug.h>
+
+
+static ATOMIC_NOTIFIER_HEAD(die_chain);
+
+int notify_die(enum die_val val, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+
+ return atomic_notifier_call_chain(&die_chain, val, &args);
+}
+
+int register_die_notifier(struct notifier_block *nb)
+{
+ vmalloc_sync_all();
+ return atomic_notifier_chain_register(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(register_die_notifier);
+
+int unregister_die_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&die_chain, nb);
+}
+EXPORT_SYMBOL_GPL(unregister_die_notifier);
+
+
diff --git a/kernel/exit.c b/kernel/exit.c
index f132349c032..f5a7abb621f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -7,7 +7,6 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/completion.h>
@@ -790,7 +789,7 @@ static void exit_notify(struct task_struct *tsk)
pgrp = task_pgrp(tsk);
if ((task_pgrp(t) != pgrp) &&
- (task_session(t) != task_session(tsk)) &&
+ (task_session(t) == task_session(tsk)) &&
will_become_orphaned_pgrp(pgrp, tsk) &&
has_stopped_jobs(pgrp)) {
__kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp);
@@ -1033,6 +1032,8 @@ asmlinkage void sys_exit_group(int error_code)
static int eligible_child(pid_t pid, int options, struct task_struct *p)
{
+ int err;
+
if (pid > 0) {
if (p->pid != pid)
return 0;
@@ -1066,8 +1067,9 @@ static int eligible_child(pid_t pid, int options, struct task_struct *p)
if (delay_group_leader(p))
return 2;
- if (security_task_wait(p))
- return 0;
+ err = security_task_wait(p);
+ if (err)
+ return err;
return 1;
}
@@ -1449,6 +1451,7 @@ static long do_wait(pid_t pid, int options, struct siginfo __user *infop,
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
int flag, retval;
+ int allowed, denied;
add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
@@ -1457,6 +1460,7 @@ repeat:
* match our criteria, even if we are not able to reap it yet.
*/
flag = 0;
+ allowed = denied = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
@@ -1472,6 +1476,12 @@ repeat:
if (!ret)
continue;
+ if (unlikely(ret < 0)) {
+ denied = ret;
+ continue;
+ }
+ allowed = 1;
+
switch (p->state) {
case TASK_TRACED:
/*
@@ -1570,6 +1580,8 @@ check_continued:
goto repeat;
}
retval = -ECHILD;
+ if (unlikely(denied) && !allowed)
+ retval = denied;
end:
current->state = TASK_RUNNING;
remove_wait_queue(&current->signal->wait_chldexit,&wait);
diff --git a/kernel/fork.c b/kernel/fork.c
index d154cc78648..a8dd75d4992 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -14,7 +14,6 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
-#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
@@ -286,6 +285,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
if (retval)
goto out;
}
+ /* a new mm has just been created */
+ arch_dup_mmap(oldmm, mm);
retval = 0;
out:
up_write(&mm->mmap_sem);
@@ -933,8 +934,8 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
static inline void rt_mutex_init_task(struct task_struct *p)
{
-#ifdef CONFIG_RT_MUTEXES
spin_lock_init(&p->pi_lock);
+#ifdef CONFIG_RT_MUTEXES
plist_head_init(&p->pi_waiters, &p->pi_lock);
p->pi_blocked_on = NULL;
#endif
@@ -1423,8 +1424,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long fl
{
struct sighand_struct *sighand = data;
- if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
- SLAB_CTOR_CONSTRUCTOR)
+ if (flags & SLAB_CTOR_CONSTRUCTOR)
spin_lock_init(&sighand->siglock);
}
@@ -1515,26 +1515,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unshare the mnt_namespace structure if it is being shared
- */
-static int unshare_mnt_namespace(unsigned long unshare_flags,
- struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
-{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
-
- if ((unshare_flags & CLONE_NEWNS) && ns) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
- if (!*new_nsp)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
* Unsharing of sighand is not supported yet
*/
static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
@@ -1592,16 +1572,6 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n
return 0;
}
-#ifndef CONFIG_IPC_NS
-static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns)
-{
- if (flags & CLONE_NEWIPC)
- return -EINVAL;
-
- return 0;
-}
-#endif
-
/*
* unshare allows a process to 'unshare' part of the process
* context which was originally shared using clone. copy_*
@@ -1614,14 +1584,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
{
int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct mnt_namespace *ns, *new_ns = NULL;
struct sighand_struct *new_sigh = NULL;
struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct sem_undo_list *new_ulist = NULL;
struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL;
- struct uts_namespace *uts, *new_uts = NULL;
- struct ipc_namespace *ipc, *new_ipc = NULL;
check_unshare_flags(&unshare_flags);
@@ -1636,36 +1603,24 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
goto bad_unshare_cleanup_thread;
- if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
- goto bad_unshare_cleanup_fs;
if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_ns;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_vm(unshare_flags, &new_mm)))
goto bad_unshare_cleanup_sigh;
if ((err = unshare_fd(unshare_flags, &new_fd)))
goto bad_unshare_cleanup_vm;
if ((err = unshare_semundo(unshare_flags, &new_ulist)))
goto bad_unshare_cleanup_fd;
- if ((err = unshare_utsname(unshare_flags, &new_uts)))
+ if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+ new_fs)))
goto bad_unshare_cleanup_semundo;
- if ((err = unshare_ipcs(unshare_flags, &new_ipc)))
- goto bad_unshare_cleanup_uts;
-
- if (new_ns || new_uts || new_ipc) {
- old_nsproxy = current->nsproxy;
- new_nsproxy = dup_namespaces(old_nsproxy);
- if (!new_nsproxy) {
- err = -ENOMEM;
- goto bad_unshare_cleanup_ipc;
- }
- }
- if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
- new_uts || new_ipc) {
+ if (new_fs || new_mm || new_fd || new_ulist || new_nsproxy) {
task_lock(current);
if (new_nsproxy) {
+ old_nsproxy = current->nsproxy;
current->nsproxy = new_nsproxy;
new_nsproxy = old_nsproxy;
}
@@ -1676,12 +1631,6 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fs = fs;
}
- if (new_ns) {
- ns = current->nsproxy->mnt_ns;
- current->nsproxy->mnt_ns = new_ns;
- new_ns = ns;
- }
-
if (new_mm) {
mm = current->mm;
active_mm = current->active_mm;
@@ -1697,32 +1646,12 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
new_fd = fd;
}
- if (new_uts) {
- uts = current->nsproxy->uts_ns;
- current->nsproxy->uts_ns = new_uts;
- new_uts = uts;
- }
-
- if (new_ipc) {
- ipc = current->nsproxy->ipc_ns;
- current->nsproxy->ipc_ns = new_ipc;
- new_ipc = ipc;
- }
-
task_unlock(current);
}
if (new_nsproxy)
put_nsproxy(new_nsproxy);
-bad_unshare_cleanup_ipc:
- if (new_ipc)
- put_ipc_ns(new_ipc);
-
-bad_unshare_cleanup_uts:
- if (new_uts)
- put_uts_ns(new_uts);
-
bad_unshare_cleanup_semundo:
bad_unshare_cleanup_fd:
if (new_fd)
@@ -1737,10 +1666,6 @@ bad_unshare_cleanup_sigh:
if (atomic_dec_and_test(&new_sigh->count))
kmem_cache_free(sighand_cachep, new_sigh);
-bad_unshare_cleanup_ns:
- if (new_ns)
- put_mnt_ns(new_ns);
-
bad_unshare_cleanup_fs:
if (new_fs)
put_fs_struct(new_fs);
diff --git a/kernel/futex.c b/kernel/futex.c
index e749e7df14b..600bc9d801f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -48,6 +48,7 @@
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/signal.h>
+#include <linux/module.h>
#include <asm/futex.h>
#include "rtmutex_common.h"
@@ -55,32 +56,6 @@
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
/*
- * Futexes are matched on equal values of this key.
- * The key type depends on whether it's a shared or private mapping.
- * Don't rearrange members without looking at hash_futex().
- *
- * offset is aligned to a multiple of sizeof(u32) (== 4) by definition.
- * We set bit 0 to indicate if it's an inode-based key.
- */
-union futex_key {
- struct {
- unsigned long pgoff;
- struct inode *inode;
- int offset;
- } shared;
- struct {
- unsigned long address;
- struct mm_struct *mm;
- int offset;
- } private;
- struct {
- unsigned long word;
- void *ptr;
- int offset;
- } both;
-};
-
-/*
* Priority Inheritance state:
*/
struct futex_pi_state {
@@ -175,7 +150,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*
* Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/
-static int get_futex_key(u32 __user *uaddr, union futex_key *key)
+int get_futex_key(u32 __user *uaddr, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -246,6 +221,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
}
return err;
}
+EXPORT_SYMBOL_GPL(get_futex_key);
/*
* Take a reference to the resource addressed by a key.
@@ -254,7 +230,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
* NOTE: mmap_sem MUST be held between get_futex_key() and calling this
* function, if it is called at all. mmap_sem keeps key->shared.inode valid.
*/
-static inline void get_key_refs(union futex_key *key)
+inline void get_futex_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -263,12 +239,13 @@ static inline void get_key_refs(union futex_key *key)
atomic_inc(&key->private.mm->mm_count);
}
}
+EXPORT_SYMBOL_GPL(get_futex_key_refs);
/*
* Drop a reference to the resource addressed by a key.
* The hash bucket spinlock must not be held.
*/
-static void drop_key_refs(union futex_key *key)
+void drop_futex_key_refs(union futex_key *key)
{
if (key->both.ptr != 0) {
if (key->both.offset & 1)
@@ -277,6 +254,7 @@ static void drop_key_refs(union futex_key *key)
mmdrop(key->private.mm);
}
}
+EXPORT_SYMBOL_GPL(drop_futex_key_refs);
static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
{
@@ -565,6 +543,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
if (!pi_state)
return -EINVAL;
+ spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -604,6 +583,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
pi_state->owner = new_owner;
spin_unlock_irq(&new_owner->pi_lock);
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
rt_mutex_unlock(&pi_state->pi_mutex);
return 0;
@@ -871,7 +851,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
this->lock_ptr = &hb2->lock;
}
this->key = key2;
- get_key_refs(&key2);
+ get_futex_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
@@ -884,9 +864,9 @@ out_unlock:
if (hb1 != hb2)
spin_unlock(&hb2->lock);
- /* drop_key_refs() must be called outside the spinlocks. */
+ /* drop_futex_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
- drop_key_refs(&key1);
+ drop_futex_key_refs(&key1);
out:
up_read(&current->mm->mmap_sem);
@@ -904,7 +884,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
init_waitqueue_head(&q->waiters);
- get_key_refs(&q->key);
+ get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -923,7 +903,7 @@ static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}
/*
@@ -978,7 +958,7 @@ static int unqueue_me(struct futex_q *q)
ret = 1;
}
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
return ret;
}
@@ -997,15 +977,18 @@ static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
spin_unlock(&hb->lock);
- drop_key_refs(&q->key);
+ drop_futex_key_refs(&q->key);
}
-static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
+static long futex_wait_restart(struct restart_block *restart);
+static int futex_wait_abstime(u32 __user *uaddr, u32 val,
+ int timed, unsigned long abs_time)
{
struct task_struct *curr = current;
DECLARE_WAITQUEUE(wait, curr);
struct futex_hash_bucket *hb;
struct futex_q q;
+ unsigned long time_left = 0;
u32 uval;
int ret;
@@ -1085,8 +1068,21 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
* !list_empty() is safe here without any lock.
* q.lock_ptr != 0 is not safe, because of ordering against wakeup.
*/
- if (likely(!list_empty(&q.list)))
- time = schedule_timeout(time);
+ time_left = 0;
+ if (likely(!list_empty(&q.list))) {
+ unsigned long rel_time;
+
+ if (timed) {
+ unsigned long now = jiffies;
+ if (time_after(now, abs_time))
+ rel_time = 0;
+ else
+ rel_time = abs_time - now;
+ } else
+ rel_time = MAX_SCHEDULE_TIMEOUT;
+
+ time_left = schedule_timeout(rel_time);
+ }
__set_current_state(TASK_RUNNING);
/*
@@ -1097,13 +1093,25 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
/* If we were woken (and unqueued), we succeeded, whatever. */
if (!unqueue_me(&q))
return 0;
- if (time == 0)
+ if (time_left == 0)
return -ETIMEDOUT;
+
/*
* We expect signal_pending(current), but another thread may
* have handled it for us already.
*/
- return -EINTR;
+ if (time_left == MAX_SCHEDULE_TIMEOUT)
+ return -ERESTARTSYS;
+ else {
+ struct restart_block *restart;
+ restart = &current_thread_info()->restart_block;
+ restart->fn = futex_wait_restart;
+ restart->arg0 = (unsigned long)uaddr;
+ restart->arg1 = (unsigned long)val;
+ restart->arg2 = (unsigned long)timed;
+ restart->arg3 = abs_time;
+ return -ERESTART_RESTARTBLOCK;
+ }
out_unlock_release_sem:
queue_unlock(&q, hb);
@@ -1113,6 +1121,24 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
return ret;
}
+static int futex_wait(u32 __user *uaddr, u32 val, unsigned long rel_time)
+{
+ int timed = (rel_time != MAX_SCHEDULE_TIMEOUT);
+ return futex_wait_abstime(uaddr, val, timed, jiffies+rel_time);
+}
+
+static long futex_wait_restart(struct restart_block *restart)
+{
+ u32 __user *uaddr = (u32 __user *)restart->arg0;
+ u32 val = (u32)restart->arg1;
+ int timed = (int)restart->arg2;
+ unsigned long abs_time = restart->arg3;
+
+ restart->fn = do_no_restart_syscall;
+ return (long)futex_wait_abstime(uaddr, val, timed, abs_time);
+}
+
+
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ec4cb9f3e3b..c9f4f044a8a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -59,6 +59,7 @@ ktime_t ktime_get(void)
return timespec_to_ktime(now);
}
+EXPORT_SYMBOL_GPL(ktime_get);
/**
* ktime_get_real - get the real (wall-) time in ktime_t format
@@ -135,7 +136,7 @@ EXPORT_SYMBOL_GPL(ktime_get_ts);
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
ktime_t xtim, tomono;
- struct timespec xts;
+ struct timespec xts, tom;
unsigned long seq;
do {
@@ -145,10 +146,11 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
#else
xts = xtime;
#endif
+ tom = wall_to_monotonic;
} while (read_seqretry(&xtime_lock, seq));
xtim = timespec_to_ktime(xts);
- tomono = timespec_to_ktime(wall_to_monotonic);
+ tomono = timespec_to_ktime(tom);
base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
base->clock_base[CLOCK_MONOTONIC].softirq_time =
ktime_add(xtim, tomono);
@@ -277,6 +279,8 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
return ktime_add(kt, tmp);
}
+
+EXPORT_SYMBOL_GPL(ktime_add_ns);
# endif /* !CONFIG_KTIME_SCALAR */
/*
@@ -458,6 +462,18 @@ void clock_was_set(void)
}
/*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt (on the local CPU):
+ */
+void hres_timers_resume(void)
+{
+ WARN_ON_ONCE(num_online_cpus() > 1);
+
+ /* Retrigger the CPU local events: */
+ retrigger_next_event(NULL);
+}
+
+/*
* Check, whether the timer is on the callback pending list
*/
static inline int hrtimer_cb_pending(const struct hrtimer *timer)
@@ -644,9 +660,16 @@ hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
orun++;
}
timer->expires = ktime_add(timer->expires, interval);
+ /*
+ * Make sure, that the result did not wrap with a very large
+ * interval.
+ */
+ if (timer->expires.tv64 < 0)
+ timer->expires = ktime_set(KTIME_SEC_MAX, 0);
return orun;
}
+EXPORT_SYMBOL_GPL(hrtimer_forward);
/*
* enqueue_hrtimer - internal function to (re)start a timer
@@ -807,7 +830,12 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
timer_stats_hrtimer_set_start_info(timer);
- enqueue_hrtimer(timer, new_base, base == new_base);
+ /*
+ * Only allow reprogramming if the new base is on this CPU.
+ * (it might still be on another CPU if the timer was pending)
+ */
+ enqueue_hrtimer(timer, new_base,
+ new_base->cpu_base == &__get_cpu_var(hrtimer_bases));
unlock_hrtimer_base(timer, &flags);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 0133f4f9e9f..615ce97c6cf 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -11,6 +11,7 @@
*/
#include <linux/irq.h>
+#include <linux/msi.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
@@ -185,6 +186,8 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
desc = irq_desc + irq;
spin_lock_irqsave(&desc->lock, flags);
desc->msi_desc = entry;
+ if (entry)
+ entry->irq = irq;
spin_unlock_irqrestore(&desc->lock, flags);
return 0;
}
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 85a430da0fb..d8ee241115f 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -54,7 +54,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
rc = request_irq(irq, handler, irqflags, devname, dev_id);
if (rc) {
- kfree(dr);
+ devres_free(dr);
return rc;
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index aff1f0fabb0..32e1ab1477d 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -48,7 +48,7 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
*
* Controller mappings for all interrupt sources:
*/
-struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
+struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
.status = IRQ_DISABLED,
.chip = &no_irq_chip,
@@ -180,6 +180,8 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
if (desc->chip->ack)
desc->chip->ack(irq);
action_ret = handle_IRQ_event(irq, desc->action);
+ if (!noirqdebug)
+ note_interrupt(irq, desc, action_ret);
desc->chip->end(irq);
return 1;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5597c157442..203a518b6f1 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -317,10 +317,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
}
*p = new;
-#if defined(CONFIG_IRQ_PER_CPU)
- if (new->flags & IRQF_PERCPU)
- desc->status |= IRQ_PER_CPU;
-#endif
+
/* Exclude IRQ from balancing */
if (new->flags & IRQF_NOBALANCING)
desc->status |= IRQ_NO_BALANCING;
@@ -328,6 +325,11 @@ int setup_irq(unsigned int irq, struct irqaction *new)
if (!shared) {
irq_chip_set_defaults(desc->chip);
+#if defined(CONFIG_IRQ_PER_CPU)
+ if (new->flags & IRQF_PERCPU)
+ desc->status |= IRQ_PER_CPU;
+#endif
+
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
if (desc->chip && desc->chip->set_type)
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 2db91eb54ad..ddde0ef9ccd 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -66,12 +66,19 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
{
struct irq_desc *desc = irq_desc + irq;
struct irqaction *action;
+ unsigned long flags;
+ int ret = 1;
- for (action = desc->action ; action; action = action->next)
+ spin_lock_irqsave(&desc->lock, flags);
+ for (action = desc->action ; action; action = action->next) {
if ((action != new_action) && action->name &&
- !strcmp(new_action->name, action->name))
- return 0;
- return 1;
+ !strcmp(new_action->name, action->name)) {
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&desc->lock, flags);
+ return ret;
}
void register_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 9d8c79b4882..b0d81aae472 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -146,7 +146,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
if (unlikely(irqfixup)) {
/* Don't punish working computers */
- if ((irqfixup == 2 && irq == 0) || action_ret == IRQ_NONE) {
+ if ((irqfixup == 2 && ((irq == 0) ||
+ (desc->action->flags & IRQF_IRQPOLL))) ||
+ action_ret == IRQ_NONE) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 307c6a632ef..3205e8e114f 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -7,7 +7,6 @@
/* These are all the functions necessary to implement itimers */
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/syscalls.h>
#include <linux/time.h>
@@ -139,59 +138,11 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
}
/*
- * We do not care about correctness. We just sanitize the values so
- * the ktime_t operations which expect normalized values do not
- * break. This converts negative values to long timeouts similar to
- * the code in kernel versions < 2.6.16
- *
- * Print a limited number of warning messages when an invalid timeval
- * is detected.
- */
-static void fixup_timeval(struct timeval *tv, int interval)
-{
- static int warnlimit = 10;
- unsigned long tmp;
-
- if (warnlimit > 0) {
- warnlimit--;
- printk(KERN_WARNING
- "setitimer: %s (pid = %d) provided "
- "invalid timeval %s: tv_sec = %ld tv_usec = %ld\n",
- current->comm, current->pid,
- interval ? "it_interval" : "it_value",
- tv->tv_sec, (long) tv->tv_usec);
- }
-
- tmp = tv->tv_usec;
- if (tmp >= USEC_PER_SEC) {
- tv->tv_usec = tmp % USEC_PER_SEC;
- tv->tv_sec += tmp / USEC_PER_SEC;
- }
-
- tmp = tv->tv_sec;
- if (tmp > LONG_MAX)
- tv->tv_sec = LONG_MAX;
-}
-
-/*
* Returns true if the timeval is in canonical form
*/
#define timeval_valid(t) \
(((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC))
-/*
- * Check for invalid timevals, sanitize them and print a limited
- * number of warnings.
- */
-static void check_itimerval(struct itimerval *value) {
-
- if (unlikely(!timeval_valid(&value->it_value)))
- fixup_timeval(&value->it_value, 0);
-
- if (unlikely(!timeval_valid(&value->it_interval)))
- fixup_timeval(&value->it_interval, 1);
-}
-
int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
{
struct task_struct *tsk = current;
@@ -201,15 +152,10 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
/*
* Validate the timevals in value.
- *
- * Note: Although the spec requires that invalid values shall
- * return -EINVAL, we just fixup the value and print a limited
- * number of warnings in order not to break users of this
- * historical misfeature.
- *
- * Scheduled for replacement in March 2007
*/
- check_itimerval(value);
+ if (!timeval_valid(&value->it_value) ||
+ !timeval_valid(&value->it_interval))
+ return -EINVAL;
switch (which) {
case ITIMER_REAL:
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f294ff4f9e..f1bda23140b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -214,8 +214,10 @@ static unsigned long get_symbol_pos(unsigned long addr,
symbol_end = (unsigned long)_etext;
}
- *symbolsize = symbol_end - symbol_start;
- *offset = addr - symbol_start;
+ if (symbolsize)
+ *symbolsize = symbol_end - symbol_start;
+ if (offset)
+ *offset = addr - symbol_start;
return low;
}
@@ -267,27 +269,69 @@ const char *kallsyms_lookup(unsigned long addr,
return NULL;
}
-/* Replace "%s" in format with address, or returns -errno. */
-void __print_symbol(const char *fmt, unsigned long address)
+int lookup_symbol_name(unsigned long addr, char *symname)
+{
+ symname[0] = '\0';
+ symname[KSYM_NAME_LEN] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, NULL, NULL);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_name(addr, symname);
+}
+
+int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ name[0] = '\0';
+ name[KSYM_NAME_LEN] = '\0';
+
+ if (is_ksym_addr(addr)) {
+ unsigned long pos;
+
+ pos = get_symbol_pos(addr, size, offset);
+ /* Grab name */
+ kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ modname[0] = '\0';
+ return 0;
+ }
+ /* see if it's in a module */
+ return lookup_module_symbol_attrs(addr, size, offset, modname, name);
+}
+
+/* Look up a kernel symbol and return it in a text buffer. */
+int sprint_symbol(char *buffer, unsigned long address)
{
char *modname;
const char *name;
unsigned long offset, size;
char namebuf[KSYM_NAME_LEN+1];
- char buffer[sizeof("%s+%#lx/%#lx [%s]") + KSYM_NAME_LEN +
- 2*(BITS_PER_LONG*3/10) + MODULE_NAME_LEN + 1];
name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
-
if (!name)
- sprintf(buffer, "0x%lx", address);
+ return sprintf(buffer, "0x%lx", address);
else {
if (modname)
- sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
+ return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset,
size, modname);
else
- sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
+ return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size);
}
+}
+
+/* Look up a kernel symbol and print it to the kernel messages. */
+void __print_symbol(const char *fmt, unsigned long address)
+{
+ char buffer[KSYM_SYMBOL_LEN];
+
+ sprint_symbol(buffer, address);
+
printk(fmt, buffer);
}
@@ -295,25 +339,20 @@ void __print_symbol(const char *fmt, unsigned long address)
struct kallsym_iter
{
loff_t pos;
- struct module *owner;
unsigned long value;
unsigned int nameoff; /* If iterating in core kernel symbols */
char type;
char name[KSYM_NAME_LEN+1];
+ char module_name[MODULE_NAME_LEN + 1];
+ int exported;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
{
- iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
- &iter->value, &iter->type,
- iter->name, sizeof(iter->name));
- if (iter->owner == NULL)
+ if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value,
+ &iter->type, iter->name, iter->module_name,
+ &iter->exported) < 0)
return 0;
-
- /* Label it "global" if it is exported, "local" if not exported. */
- iter->type = is_exported(iter->name, iter->owner)
- ? toupper(iter->type) : tolower(iter->type);
-
return 1;
}
@@ -322,7 +361,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
{
unsigned off = iter->nameoff;
- iter->owner = NULL;
+ iter->module_name[0] = '\0';
iter->value = kallsyms_addresses[iter->pos];
iter->type = kallsyms_get_symbol_type(off);
@@ -386,12 +425,17 @@ static int s_show(struct seq_file *m, void *p)
if (!iter->name[0])
return 0;
- if (iter->owner)
+ if (iter->module_name[0]) {
+ char type;
+
+ /* Label it "global" if it is exported,
+ * "local" if not exported. */
+ type = iter->exported ? toupper(iter->type) :
+ tolower(iter->type);
seq_printf(m, "%0*lx %c %s\t[%s]\n",
(int)(2*sizeof(void*)),
- iter->value, iter->type, iter->name,
- module_name(iter->owner));
- else
+ iter->value, type, iter->name, iter->module_name);
+ } else
seq_printf(m, "%0*lx %c %s\n",
(int)(2*sizeof(void*)),
iter->value, iter->type, iter->name);
@@ -426,18 +470,11 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return ret;
}
-static int kallsyms_release(struct inode *inode, struct file *file)
-{
- struct seq_file *m = (struct seq_file *)file->private_data;
- kfree(m->private);
- return seq_release(inode, file);
-}
-
static const struct file_operations kallsyms_operations = {
.open = kallsyms_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = kallsyms_release,
+ .release = seq_release_private,
};
static int __init kallsyms_init(void)
@@ -452,3 +489,4 @@ static int __init kallsyms_init(void)
__initcall(kallsyms_init);
EXPORT_SYMBOL(__print_symbol);
+EXPORT_SYMBOL_GPL(sprint_symbol);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 2a59c8a01ae..25db14b89e8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1118,8 +1118,8 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
memset(&prstatus, 0, sizeof(prstatus));
prstatus.pr_pid = current->pid;
elf_core_copy_regs(&prstatus.pr_reg, regs);
- buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus,
- sizeof(prstatus));
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
final_note(buf);
}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 796276141e5..49cc4b9c1a8 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -23,7 +23,6 @@
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
-#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/mnt_namespace.h>
#include <linux/completion.h>
@@ -166,6 +165,12 @@ static int ____call_usermodehelper(void *data)
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
+ /*
+ * Our parent is keventd, which runs with elevated scheduling priority.
+ * Avoid propagating that into the userspace child.
+ */
+ set_user_nice(current, 0);
+
retval = -EPERM;
if (current->fs->root)
retval = kernel_execve(sub_info->path,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d25a9ada3f8..9e47d8c493f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,16 +35,19 @@
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/stddef.h>
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
+#include <linux/kdebug.h>
+
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
-#include <asm/kdebug.h>
+#include <asm/uaccess.h>
#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
@@ -63,6 +66,9 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
static atomic_t kprobe_count;
+/* NOTE: change this value only with kprobe_mutex held */
+static bool kprobe_enabled;
+
DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -132,9 +138,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
- retry:
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ retry:
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
if (kip->nused < INSNS_PER_PAGE) {
int i;
for (i = 0; i < INSNS_PER_PAGE; i++) {
@@ -155,9 +160,8 @@ kprobe_opcode_t __kprobes *get_insn_slot(void)
}
/* All out of space. Need to allocate a new page. Use slot 0. */
kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL);
- if (!kip) {
+ if (!kip)
return NULL;
- }
/*
* Use module_alloc so this page is within +/- 2GB of where the
@@ -213,9 +217,8 @@ static int __kprobes collect_garbage_slots(void)
if (check_safety() != 0)
return -EAGAIN;
- hlist_for_each_safe(pos, next, &kprobe_insn_pages) {
+ hlist_for_each_entry_safe(kip, pos, next, &kprobe_insn_pages, hlist) {
int i;
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
if (kip->ngarbage == 0)
continue;
kip->ngarbage = 0; /* we will collect all garbages */
@@ -234,8 +237,7 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
struct kprobe_insn_page *kip;
struct hlist_node *pos;
- hlist_for_each(pos, &kprobe_insn_pages) {
- kip = hlist_entry(pos, struct kprobe_insn_page, hlist);
+ hlist_for_each_entry(kip, pos, &kprobe_insn_pages, hlist) {
if (kip->insns <= slot &&
slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) {
int i = (slot - kip->insns) / MAX_INSN_SIZE;
@@ -248,9 +250,9 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty)
break;
}
}
- if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) {
+
+ if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE)
collect_garbage_slots();
- }
}
#endif
@@ -316,7 +318,6 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
reset_kprobe_instance();
}
}
- return;
}
static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
@@ -362,46 +363,6 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
/* Called with kretprobe_lock held */
-struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
-{
- struct hlist_node *node;
- struct kretprobe_instance *ri;
- hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
- return ri;
- return NULL;
-}
-
-/* Called with kretprobe_lock held */
-static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
- *rp)
-{
- struct hlist_node *node;
- struct kretprobe_instance *ri;
- hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
- return ri;
- return NULL;
-}
-
-/* Called with kretprobe_lock held */
-void __kprobes add_rp_inst(struct kretprobe_instance *ri)
-{
- /*
- * Remove rp inst off the free list -
- * Add it back when probed function returns
- */
- hlist_del(&ri->uflist);
-
- /* Add rp inst onto table */
- INIT_HLIST_NODE(&ri->hlist);
- hlist_add_head(&ri->hlist,
- &kretprobe_inst_table[hash_ptr(ri->task, KPROBE_HASH_BITS)]);
-
- /* Also add this rp inst to the used list. */
- INIT_HLIST_NODE(&ri->uflist);
- hlist_add_head(&ri->uflist, &ri->rp->used_instances);
-}
-
-/* Called with kretprobe_lock held */
void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
struct hlist_head *head)
{
@@ -454,7 +415,9 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
static inline void free_rp_inst(struct kretprobe *rp)
{
struct kretprobe_instance *ri;
- while ((ri = get_free_rp_inst(rp)) != NULL) {
+ struct hlist_node *pos, *next;
+
+ hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) {
hlist_del(&ri->uflist);
kfree(ri);
}
@@ -535,8 +498,8 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
static int __kprobes in_kprobes_functions(unsigned long addr)
{
- if (addr >= (unsigned long)__kprobes_text_start
- && addr < (unsigned long)__kprobes_text_end)
+ if (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end)
return -EINVAL;
return 0;
}
@@ -563,19 +526,24 @@ static int __kprobes __register_kprobe(struct kprobe *p,
return -EINVAL;
p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset);
- if ((!kernel_text_address((unsigned long) p->addr)) ||
- in_kprobes_functions((unsigned long) p->addr))
+ if (!kernel_text_address((unsigned long) p->addr) ||
+ in_kprobes_functions((unsigned long) p->addr))
return -EINVAL;
p->mod_refcounted = 0;
- /* Check are we probing a module */
- if ((probed_mod = module_text_address((unsigned long) p->addr))) {
+
+ /*
+ * Check if are we probing a module.
+ */
+ probed_mod = module_text_address((unsigned long) p->addr);
+ if (probed_mod) {
struct module *calling_mod = module_text_address(called_from);
- /* We must allow modules to probe themself and
- * in this case avoid incrementing the module refcount,
- * so as to allow unloading of self probing modules.
+ /*
+ * We must allow modules to probe themself and in this case
+ * avoid incrementing the module refcount, so as to allow
+ * unloading of self probing modules.
*/
- if (calling_mod && (calling_mod != probed_mod)) {
+ if (calling_mod && calling_mod != probed_mod) {
if (unlikely(!try_module_get(probed_mod)))
return -EINVAL;
p->mod_refcounted = 1;
@@ -593,19 +561,21 @@ static int __kprobes __register_kprobe(struct kprobe *p,
goto out;
}
- if ((ret = arch_prepare_kprobe(p)) != 0)
+ ret = arch_prepare_kprobe(p);
+ if (ret)
goto out;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- if (atomic_add_return(1, &kprobe_count) == \
+ if (kprobe_enabled) {
+ if (atomic_add_return(1, &kprobe_count) == \
(ARCH_INACTIVE_KPROBE_COUNT + 1))
- register_page_fault_notifier(&kprobe_page_fault_nb);
-
- arch_arm_kprobe(p);
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+ arch_arm_kprobe(p);
+ }
out:
mutex_unlock(&kprobe_mutex);
@@ -616,8 +586,7 @@ out:
int __kprobes register_kprobe(struct kprobe *p)
{
- return __register_kprobe(p,
- (unsigned long)__builtin_return_address(0));
+ return __register_kprobe(p, (unsigned long)__builtin_return_address(0));
}
void __kprobes unregister_kprobe(struct kprobe *p)
@@ -641,11 +610,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
return;
}
valid_p:
- if ((old_p == p) || ((old_p->pre_handler == aggr_pre_handler) &&
- (p->list.next == &old_p->list) &&
- (p->list.prev == &old_p->list))) {
- /* Only probe on the hash list */
- arch_disarm_kprobe(p);
+ if (old_p == p ||
+ (old_p->pre_handler == aggr_pre_handler &&
+ p->list.next == &old_p->list && p->list.prev == &old_p->list)) {
+ /*
+ * Only probe on the hash list. Disarm only if kprobes are
+ * enabled - otherwise, the breakpoint would already have
+ * been removed. We save on flushing icache.
+ */
+ if (kprobe_enabled)
+ arch_disarm_kprobe(p);
hlist_del_rcu(&old_p->hlist);
cleanup_p = 1;
} else {
@@ -656,9 +630,11 @@ valid_p:
mutex_unlock(&kprobe_mutex);
synchronize_sched();
- if (p->mod_refcounted &&
- (mod = module_text_address((unsigned long)p->addr)))
- module_put(mod);
+ if (p->mod_refcounted) {
+ mod = module_text_address((unsigned long)p->addr);
+ if (mod)
+ module_put(mod);
+ }
if (cleanup_p) {
if (p != old_p) {
@@ -729,7 +705,21 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
/*TODO: consider to only swap the RA after the last pre_handler fired */
spin_lock_irqsave(&kretprobe_lock, flags);
- arch_prepare_kretprobe(rp, regs);
+ if (!hlist_empty(&rp->free_instances)) {
+ struct kretprobe_instance *ri;
+
+ ri = hlist_entry(rp->free_instances.first,
+ struct kretprobe_instance, uflist);
+ ri->rp = rp;
+ ri->task = current;
+ arch_prepare_kretprobe(ri, regs);
+
+ /* XXX(hch): why is there no hlist_move_head? */
+ hlist_del(&ri->uflist);
+ hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+ hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task));
+ } else
+ rp->nmissed++;
spin_unlock_irqrestore(&kretprobe_lock, flags);
return 0;
}
@@ -792,11 +782,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
{
unsigned long flags;
struct kretprobe_instance *ri;
+ struct hlist_node *pos, *next;
unregister_kprobe(&rp->kp);
+
/* No race here */
spin_lock_irqsave(&kretprobe_lock, flags);
- while ((ri = get_used_rp_inst(rp)) != NULL) {
+ hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) {
ri->rp = NULL;
hlist_del(&ri->uflist);
}
@@ -816,6 +808,9 @@ static int __init init_kprobes(void)
}
atomic_set(&kprobe_count, 0);
+ /* By default, kprobes are enabled */
+ kprobe_enabled = true;
+
err = arch_init_kprobes();
if (!err)
err = register_die_notifier(&kprobe_exceptions_nb);
@@ -825,7 +820,7 @@ static int __init init_kprobes(void)
#ifdef CONFIG_DEBUG_FS
static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
- const char *sym, int offset,char *modname)
+ const char *sym, int offset,char *modname)
{
char *kprobe_type;
@@ -867,13 +862,13 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
struct kprobe *p, *kp;
const char *sym = NULL;
unsigned int i = *(loff_t *) v;
- unsigned long size, offset = 0;
+ unsigned long offset = 0;
char *modname, namebuf[128];
head = &kprobe_table[i];
preempt_disable();
hlist_for_each_entry_rcu(p, node, head, hlist) {
- sym = kallsyms_lookup((unsigned long)p->addr, &size,
+ sym = kallsyms_lookup((unsigned long)p->addr, NULL,
&offset, &modname, namebuf);
if (p->pre_handler == aggr_pre_handler) {
list_for_each_entry_rcu(kp, &p->list, list)
@@ -904,21 +899,149 @@ static struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
+static void __kprobes enable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already enabled, just return */
+ if (kprobe_enabled)
+ goto already_enabled;
+
+ /*
+ * Re-register the page fault notifier only if there are any
+ * active probes at the time of enabling kprobes globally
+ */
+ if (atomic_read(&kprobe_count) > ARCH_INACTIVE_KPROBE_COUNT)
+ register_page_fault_notifier(&kprobe_page_fault_nb);
+
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist)
+ arch_arm_kprobe(p);
+ }
+
+ kprobe_enabled = true;
+ printk(KERN_INFO "Kprobes globally enabled\n");
+
+already_enabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+static void __kprobes disable_all_kprobes(void)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kprobe *p;
+ unsigned int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* If kprobes are already disabled, just return */
+ if (!kprobe_enabled)
+ goto already_disabled;
+
+ kprobe_enabled = false;
+ printk(KERN_INFO "Kprobes globally disabled\n");
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry_rcu(p, node, head, hlist) {
+ if (!arch_trampoline_kprobe(p))
+ arch_disarm_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+ /* Allow all currently running kprobes to complete */
+ synchronize_sched();
+
+ mutex_lock(&kprobe_mutex);
+ /* Unconditionally unregister the page_fault notifier */
+ unregister_page_fault_notifier(&kprobe_page_fault_nb);
+
+already_disabled:
+ mutex_unlock(&kprobe_mutex);
+ return;
+}
+
+/*
+ * XXX: The debugfs bool file interface doesn't allow for callbacks
+ * when the bool state is switched. We can reuse that facility when
+ * available
+ */
+static ssize_t read_enabled_file_bool(struct file *file,
+ char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[3];
+
+ if (kprobe_enabled)
+ buf[0] = '1';
+ else
+ buf[0] = '0';
+ buf[1] = '\n';
+ buf[2] = 0x00;
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+static ssize_t write_enabled_file_bool(struct file *file,
+ const char __user *user_buf, size_t count, loff_t *ppos)
+{
+ char buf[32];
+ int buf_size;
+
+ buf_size = min(count, (sizeof(buf)-1));
+ if (copy_from_user(buf, user_buf, buf_size))
+ return -EFAULT;
+
+ switch (buf[0]) {
+ case 'y':
+ case 'Y':
+ case '1':
+ enable_all_kprobes();
+ break;
+ case 'n':
+ case 'N':
+ case '0':
+ disable_all_kprobes();
+ break;
+ }
+
+ return count;
+}
+
+static struct file_operations fops_kp = {
+ .read = read_enabled_file_bool,
+ .write = write_enabled_file_bool,
+};
+
static int __kprobes debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
+ unsigned int value = 1;
dir = debugfs_create_dir("kprobes", NULL);
if (!dir)
return -ENOMEM;
- file = debugfs_create_file("list", 0444, dir , 0 ,
+ file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
if (!file) {
debugfs_remove(dir);
return -ENOMEM;
}
+ file = debugfs_create_file("enabled", 0600, dir,
+ &value, &fops_kp);
+ if (!file) {
+ debugfs_remove(dir);
+ return -ENOMEM;
+ }
+
return 0;
}
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e0ffe4ab091..559deca5ed1 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -24,18 +24,18 @@ static struct subsys_attribute _name##_attr = \
#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
/* current uevent sequence number */
-static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page)
+static ssize_t uevent_seqnum_show(struct kset *kset, char *page)
{
return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum);
}
KERNEL_ATTR_RO(uevent_seqnum);
/* uevent helper program, used during early boo */
-static ssize_t uevent_helper_show(struct subsystem *subsys, char *page)
+static ssize_t uevent_helper_show(struct kset *kset, char *page)
{
return sprintf(page, "%s\n", uevent_helper);
}
-static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count)
+static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count)
{
if (count+1 > UEVENT_HELPER_PATH_LEN)
return -ENOENT;
@@ -49,13 +49,13 @@ KERNEL_ATTR_RW(uevent_helper);
#endif
#ifdef CONFIG_KEXEC
-static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
+static ssize_t kexec_loaded_show(struct kset *kset, char *page)
{
return sprintf(page, "%d\n", !!kexec_image);
}
KERNEL_ATTR_RO(kexec_loaded);
-static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
+static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page)
{
return sprintf(page, "%d\n", !!kexec_crash_image);
}
@@ -85,7 +85,7 @@ static int __init ksysfs_init(void)
{
int error = subsystem_register(&kernel_subsys);
if (!error)
- error = sysfs_create_group(&kernel_subsys.kset.kobj,
+ error = sysfs_create_group(&kernel_subsys.kobj,
&kernel_attr_group);
return error;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8dc24c92dc6..1a5ff2211d8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -257,9 +257,8 @@ static int save_trace(struct stack_trace *trace)
trace->entries = stack_trace + nr_stack_trace_entries;
trace->skip = 3;
- trace->all_contexts = 0;
- save_stack_trace(trace, NULL);
+ save_stack_trace(trace);
trace->max_entries = trace->nr_entries;
@@ -341,10 +340,7 @@ static const char *usage_str[] =
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
{
- unsigned long offs, size;
- char *modname;
-
- return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
+ return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
void
@@ -1313,8 +1309,9 @@ out_unlock_set:
/*
* Look up a dependency chain. If the key is not present yet then
- * add it and return 0 - in this case the new dependency chain is
- * validated. If the key is already hashed, return 1.
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
*/
static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class)
{
@@ -1577,7 +1574,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
* Mark a lock with a usage bit, and validate the state transition:
*/
static int mark_lock(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit new_bit, unsigned long ip)
+ enum lock_usage_bit new_bit)
{
unsigned int new_mask = 1 << new_bit, ret = 1;
@@ -1600,14 +1597,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
this->class->usage_mask |= new_mask;
-#ifdef CONFIG_TRACE_IRQFLAGS
- if (new_bit == LOCK_ENABLED_HARDIRQS ||
- new_bit == LOCK_ENABLED_HARDIRQS_READ)
- ip = curr->hardirq_enable_ip;
- else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
- new_bit == LOCK_ENABLED_SOFTIRQS_READ)
- ip = curr->softirq_enable_ip;
-#endif
if (!save_trace(this->class->usage_traces + new_bit))
return 0;
@@ -1806,7 +1795,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
* Mark all held locks with a usage bit:
*/
static int
-mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
+mark_held_locks(struct task_struct *curr, int hardirq)
{
enum lock_usage_bit usage_bit;
struct held_lock *hlock;
@@ -1826,7 +1815,7 @@ mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
else
usage_bit = LOCK_ENABLED_SOFTIRQS;
}
- if (!mark_lock(curr, hlock, usage_bit, ip))
+ if (!mark_lock(curr, hlock, usage_bit))
return 0;
}
@@ -1879,7 +1868,7 @@ void trace_hardirqs_on(void)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
- if (!mark_held_locks(curr, 1, ip))
+ if (!mark_held_locks(curr, 1))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -1887,7 +1876,7 @@ void trace_hardirqs_on(void)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
- if (!mark_held_locks(curr, 0, ip))
+ if (!mark_held_locks(curr, 0))
return;
curr->hardirq_enable_ip = ip;
@@ -1955,7 +1944,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
- mark_held_locks(curr, 0, ip);
+ mark_held_locks(curr, 0);
}
/*
@@ -2093,43 +2082,43 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (read) {
if (curr->hardirq_context)
if (!mark_lock(curr, hlock,
- LOCK_USED_IN_HARDIRQ_READ, ip))
+ LOCK_USED_IN_HARDIRQ_READ))
return 0;
if (curr->softirq_context)
if (!mark_lock(curr, hlock,
- LOCK_USED_IN_SOFTIRQ_READ, ip))
+ LOCK_USED_IN_SOFTIRQ_READ))
return 0;
} else {
if (curr->hardirq_context)
- if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
return 0;
if (curr->softirq_context)
- if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
return 0;
}
}
if (!hardirqs_off) {
if (read) {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS_READ, ip))
+ LOCK_ENABLED_HARDIRQS_READ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS_READ, ip))
+ LOCK_ENABLED_SOFTIRQS_READ))
return 0;
} else {
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_HARDIRQS, ip))
+ LOCK_ENABLED_HARDIRQS))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
- LOCK_ENABLED_SOFTIRQS, ip))
+ LOCK_ENABLED_SOFTIRQS))
return 0;
}
}
#endif
/* mark it as used: */
- if (!mark_lock(curr, hlock, LOCK_USED, ip))
+ if (!mark_lock(curr, hlock, LOCK_USED))
return 0;
out_calc_hash:
/*
@@ -2742,6 +2731,10 @@ void debug_show_all_locks(void)
int count = 10;
int unlock = 1;
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
printk("\nShowing all locks held in the system:\n");
/*
@@ -2785,6 +2778,10 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
void debug_show_held_locks(struct task_struct *task)
{
+ if (unlikely(!debug_locks)) {
+ printk("INFO: lockdep is turned off.\n");
+ return;
+ }
lockdep_print_held_locks(task);
}
diff --git a/kernel/module.c b/kernel/module.c
index f77e893e462..d36e45477fa 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/moduleloader.h>
#include <linux/init.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
@@ -45,6 +46,8 @@
#include <asm/cacheflush.h>
#include <linux/license.h>
+extern int module_sysfs_initialized;
+
#if 0
#define DEBUGP printk
#else
@@ -308,14 +311,14 @@ static int split_block(unsigned int i, unsigned short size)
{
/* Reallocation required? */
if (pcpu_num_used + 1 > pcpu_num_allocated) {
- int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
- GFP_KERNEL);
+ int *new;
+
+ new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
+ GFP_KERNEL);
if (!new)
return 0;
- memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
pcpu_num_allocated *= 2;
- kfree(pcpu_size);
pcpu_size = new;
}
@@ -346,10 +349,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
unsigned int i;
void *ptr;
- if (align > SMP_CACHE_BYTES) {
- printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
- name, align, SMP_CACHE_BYTES);
- align = SMP_CACHE_BYTES;
+ if (align > PAGE_SIZE) {
+ printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+ name, align, PAGE_SIZE);
+ align = PAGE_SIZE;
}
ptr = __per_cpu_start;
@@ -430,7 +433,7 @@ static int percpu_modinit(void)
pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
GFP_KERNEL);
/* Static in-kernel percpu data (used). */
- pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+ pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
/* Free room. */
pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
if (pcpu_size[1] < 0) {
@@ -1117,8 +1120,8 @@ int mod_sysfs_init(struct module *mod)
{
int err;
- if (!module_subsys.kset.subsys) {
- printk(KERN_ERR "%s: module_subsys not initialized\n",
+ if (!module_sysfs_initialized) {
+ printk(KERN_ERR "%s: module sysfs not initialized\n",
mod->name);
err = -EINVAL;
goto out;
@@ -1148,8 +1151,10 @@ int mod_sysfs_setup(struct module *mod,
goto out;
mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
- if (!mod->holders_dir)
+ if (!mod->holders_dir) {
+ err = -ENOMEM;
goto out_unreg;
+ }
err = module_param_sysfs_setup(mod, kparam, num_params);
if (err)
@@ -1467,7 +1472,7 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
}
#ifdef CONFIG_KALLSYMS
-int is_exported(const char *name, const struct module *mod)
+static int is_exported(const char *name, const struct module *mod)
{
if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
return 1;
@@ -2093,8 +2098,10 @@ static const char *get_ksymbol(struct module *mod,
if (!best)
return NULL;
- *size = nextval - mod->symtab[best].st_value;
- *offset = addr - mod->symtab[best].st_value;
+ if (size)
+ *size = nextval - mod->symtab[best].st_value;
+ if (offset)
+ *offset = addr - mod->symtab[best].st_value;
return mod->strtab + mod->symtab[best].st_name;
}
@@ -2119,8 +2126,58 @@ const char *module_address_lookup(unsigned long addr,
return NULL;
}
-struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
- char *type, char *name, size_t namelen)
+int lookup_module_symbol_name(unsigned long addr, char *symname)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, NULL, NULL);
+ if (!sym)
+ goto out;
+ strlcpy(symname, sym, KSYM_NAME_LEN + 1);
+ mutex_unlock(&module_mutex);
+ return 0;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return -ERANGE;
+}
+
+int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
+ unsigned long *offset, char *modname, char *name)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ if (within(addr, mod->module_init, mod->init_size) ||
+ within(addr, mod->module_core, mod->core_size)) {
+ const char *sym;
+
+ sym = get_ksymbol(mod, addr, size, offset);
+ if (!sym)
+ goto out;
+ if (modname)
+ strlcpy(modname, mod->name, MODULE_NAME_LEN + 1);
+ if (name)
+ strlcpy(name, sym, KSYM_NAME_LEN + 1);
+ mutex_unlock(&module_mutex);
+ return 0;
+ }
+ }
+out:
+ mutex_unlock(&module_mutex);
+ return -ERANGE;
+}
+
+int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ char *name, char *module_name, int *exported)
{
struct module *mod;
@@ -2130,14 +2187,16 @@ struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- namelen);
+ KSYM_NAME_LEN + 1);
+ strlcpy(module_name, mod->name, MODULE_NAME_LEN + 1);
+ *exported = is_exported(name, mod);
mutex_unlock(&module_mutex);
- return mod;
+ return 0;
}
symnum -= mod->num_symtab;
}
mutex_unlock(&module_mutex);
- return NULL;
+ return -ERANGE;
}
static unsigned long mod_find_symname(struct module *mod, const char *name)
@@ -2383,9 +2442,14 @@ void module_add_driver(struct module *mod, struct device_driver *drv)
struct kobject *mkobj;
/* Lookup built-in module entry in /sys/modules */
- mkobj = kset_find_obj(&module_subsys.kset, drv->mod_name);
- if (mkobj)
+ mkobj = kset_find_obj(&module_subsys, drv->mod_name);
+ if (mkobj) {
mk = container_of(mkobj, struct module_kobject, kobj);
+ /* remember our module structure */
+ drv->mkobj = mk;
+ /* kset_find_obj took a reference */
+ kobject_put(mkobj);
+ }
}
if (!mk)
@@ -2405,26 +2469,25 @@ EXPORT_SYMBOL(module_add_driver);
void module_remove_driver(struct device_driver *drv)
{
+ struct module_kobject *mk = NULL;
char *driver_name;
if (!drv)
return;
sysfs_remove_link(&drv->kobj, "module");
- if (drv->owner && drv->owner->mkobj.drivers_dir) {
+
+ if (drv->owner)
+ mk = &drv->owner->mkobj;
+ else if (drv->mkobj)
+ mk = drv->mkobj;
+ if (mk && mk->drivers_dir) {
driver_name = make_driver_name(drv);
if (driver_name) {
- sysfs_remove_link(drv->owner->mkobj.drivers_dir,
- driver_name);
+ sysfs_remove_link(mk->drivers_dir, driver_name);
kfree(driver_name);
}
}
- /*
- * Undo the additional reference we added in module_add_driver()
- * via kset_find_obj()
- */
- if (drv->mod_name)
- kobject_put(&drv->kobj);
}
EXPORT_SYMBOL(module_remove_driver);
#endif
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f5b9ee6f6bb..1bc4b55241a 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -38,10 +38,8 @@ void get_task_namespaces(struct task_struct *tsk)
/*
* creates a copy of "orig" with refcount 1.
- * This does not grab references to the contained namespaces,
- * so that needs to be done by dup_namespaces.
*/
-static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
+static inline struct nsproxy *clone_nsproxy(struct nsproxy *orig)
{
struct nsproxy *ns;
@@ -52,26 +50,49 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
}
/*
- * copies the nsproxy, setting refcount to 1, and grabbing a
- * reference to all contained namespaces. Called from
- * sys_unshare()
+ * Create new nsproxy and all of its the associated namespaces.
+ * Return the newly created nsproxy. Do not attach this to the task,
+ * leave it to the caller to do proper locking and attach it to task.
*/
-struct nsproxy *dup_namespaces(struct nsproxy *orig)
+static struct nsproxy *create_new_namespaces(int flags, struct task_struct *tsk,
+ struct fs_struct *new_fs)
{
- struct nsproxy *ns = clone_namespaces(orig);
+ struct nsproxy *new_nsp;
- if (ns) {
- if (ns->mnt_ns)
- get_mnt_ns(ns->mnt_ns);
- if (ns->uts_ns)
- get_uts_ns(ns->uts_ns);
- if (ns->ipc_ns)
- get_ipc_ns(ns->ipc_ns);
- if (ns->pid_ns)
- get_pid_ns(ns->pid_ns);
- }
+ new_nsp = clone_nsproxy(tsk->nsproxy);
+ if (!new_nsp)
+ return ERR_PTR(-ENOMEM);
- return ns;
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
+ if (IS_ERR(new_nsp->mnt_ns))
+ goto out_ns;
+
+ new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ if (IS_ERR(new_nsp->uts_ns))
+ goto out_uts;
+
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ if (IS_ERR(new_nsp->ipc_ns))
+ goto out_ipc;
+
+ new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns);
+ if (IS_ERR(new_nsp->pid_ns))
+ goto out_pid;
+
+ return new_nsp;
+
+out_pid:
+ if (new_nsp->ipc_ns)
+ put_ipc_ns(new_nsp->ipc_ns);
+out_ipc:
+ if (new_nsp->uts_ns)
+ put_uts_ns(new_nsp->uts_ns);
+out_uts:
+ if (new_nsp->mnt_ns)
+ put_mnt_ns(new_nsp->mnt_ns);
+out_ns:
+ kfree(new_nsp);
+ return ERR_PTR(-ENOMEM);
}
/*
@@ -92,47 +113,21 @@ int copy_namespaces(int flags, struct task_struct *tsk)
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
return 0;
- new_ns = clone_namespaces(old_ns);
- if (!new_ns) {
- err = -ENOMEM;
+ if (!capable(CAP_SYS_ADMIN)) {
+ err = -EPERM;
goto out;
}
- tsk->nsproxy = new_ns;
-
- err = copy_mnt_ns(flags, tsk);
- if (err)
- goto out_ns;
-
- err = copy_utsname(flags, tsk);
- if (err)
- goto out_uts;
-
- err = copy_ipcs(flags, tsk);
- if (err)
- goto out_ipc;
-
- err = copy_pid_ns(flags, tsk);
- if (err)
- goto out_pid;
+ new_ns = create_new_namespaces(flags, tsk, tsk->fs);
+ if (IS_ERR(new_ns)) {
+ err = PTR_ERR(new_ns);
+ goto out;
+ }
+ tsk->nsproxy = new_ns;
out:
put_nsproxy(old_ns);
return err;
-
-out_pid:
- if (new_ns->ipc_ns)
- put_ipc_ns(new_ns->ipc_ns);
-out_ipc:
- if (new_ns->uts_ns)
- put_uts_ns(new_ns->uts_ns);
-out_uts:
- if (new_ns->mnt_ns)
- put_mnt_ns(new_ns->mnt_ns);
-out_ns:
- tsk->nsproxy = old_ns;
- kfree(new_ns);
- goto out;
}
void free_nsproxy(struct nsproxy *ns)
@@ -147,3 +142,41 @@ void free_nsproxy(struct nsproxy *ns)
put_pid_ns(ns->pid_ns);
kfree(ns);
}
+
+/*
+ * Called from unshare. Unshare all the namespaces part of nsproxy.
+ * On sucess, returns the new nsproxy and a reference to old nsproxy
+ * to make sure it stays around.
+ */
+int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+ struct nsproxy **new_nsp, struct fs_struct *new_fs)
+{
+ struct nsproxy *old_ns = current->nsproxy;
+ int err = 0;
+
+ if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
+ return 0;
+
+#ifndef CONFIG_IPC_NS
+ if (unshare_flags & CLONE_NEWIPC)
+ return -EINVAL;
+#endif
+
+#ifndef CONFIG_UTS_NS
+ if (unshare_flags & CLONE_NEWUTS)
+ return -EINVAL;
+#endif
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_nsproxy(old_ns);
+
+ *new_nsp = create_new_namespaces(unshare_flags, current,
+ new_fs ? new_fs : current->fs);
+ if (IS_ERR(*new_nsp)) {
+ err = PTR_ERR(*new_nsp);
+ put_nsproxy(old_ns);
+ }
+ return err;
+}
diff --git a/kernel/params.c b/kernel/params.c
index e265b13195b..e61c46c97ce 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -269,7 +269,7 @@ int param_get_invbool(char *buffer, struct kernel_param *kp)
return param_get_bool(buffer, &dummy);
}
-/* We cheat here and temporarily mangle the string. */
+/* We break the rule and mangle the string. */
static int param_array(const char *name,
const char *val,
unsigned int min, unsigned int max,
@@ -356,6 +356,10 @@ int param_set_copystring(const char *val, struct kernel_param *kp)
{
struct kparam_string *kps = kp->arg;
+ if (!val) {
+ printk(KERN_ERR "%s: missing param set value\n", kp->name);
+ return -EINVAL;
+ }
if (strlen(val)+1 > kps->maxlen) {
printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
kp->name, kps->maxlen-1);
@@ -687,6 +691,7 @@ static struct kset_uevent_ops module_uevent_ops = {
};
decl_subsys(module, &module_ktype, &module_uevent_ops);
+int module_sysfs_initialized;
static struct kobj_type module_ktype = {
.sysfs_ops = &module_sysfs_ops,
@@ -705,6 +710,7 @@ static int __init param_sysfs_init(void)
__FILE__, __LINE__, ret);
return ret;
}
+ module_sysfs_initialized = 1;
param_sysfs_builtin();
diff --git a/kernel/pid.c b/kernel/pid.c
index 78f2aee90f5..d3ad724afa8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -360,16 +360,11 @@ struct pid *find_ge_pid(int nr)
}
EXPORT_SYMBOL_GPL(find_get_pid);
-int copy_pid_ns(int flags, struct task_struct *tsk)
+struct pid_namespace *copy_pid_ns(int flags, struct pid_namespace *old_ns)
{
- struct pid_namespace *old_ns = tsk->nsproxy->pid_ns;
- int err = 0;
-
- if (!old_ns)
- return 0;
-
+ BUG_ON(!old_ns);
get_pid_ns(old_ns);
- return err;
+ return old_ns;
}
void free_pid_ns(struct kref *kref)
@@ -412,7 +407,5 @@ void __init pidmap_init(void)
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
- pid_cachep = kmem_cache_create("pid", sizeof(struct pid),
- __alignof__(struct pid),
- SLAB_PANIC, NULL, NULL);
+ pid_cachep = KMEM_CACHE(pid, SLAB_PANIC);
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 657f7769741..1de710e1837 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -971,7 +971,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_prof_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
@@ -986,7 +986,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_virt_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
@@ -1001,7 +1001,7 @@ static void check_thread_timers(struct task_struct *tsk,
maxfire = 20;
tsk->it_sched_expires = 0;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || tsk->sched_time < t->expires.sched) {
@@ -1057,7 +1057,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
prof_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
@@ -1072,7 +1072,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
virt_expires = cputime_zero;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
@@ -1087,7 +1087,7 @@ static void check_process_timers(struct task_struct *tsk,
maxfire = 20;
sched_expires = 0;
while (!list_empty(timers)) {
- struct cpu_timer_list *t = list_entry(timers->next,
+ struct cpu_timer_list *t = list_first_entry(timers,
struct cpu_timer_list,
entry);
if (!--maxfire || sched_time < t->expires.sched) {
@@ -1400,7 +1400,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
*/
head = &tsk->signal->cpu_timers[clock_idx];
if (list_empty(head) ||
- cputime_ge(list_entry(head->next,
+ cputime_ge(list_first_entry(head,
struct cpu_timer_list, entry)->expires.cpu,
*newval)) {
/*
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 44318ca7197..588c99da030 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -31,7 +31,6 @@
* POSIX clocks & timers
*/
#include <linux/mm.h>
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/time.h>
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 51a4dd0f1b7..877721708fa 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -78,17 +78,22 @@ config PM_SYSFS_DEPRECATED
are likely to be bus or driver specific.
config SOFTWARE_SUSPEND
- bool "Software Suspend"
+ bool "Software Suspend (Hibernation)"
depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP))
---help---
- Enable the suspend to disk (STD) functionality.
+ Enable the suspend to disk (STD) functionality, which is usually
+ called "hibernation" in user interfaces. STD checkpoints the
+ system and powers it off; and restores that checkpoint on reboot.
You can suspend your machine with 'echo disk > /sys/power/state'.
Alternatively, you can use the additional userland tools available
from <http://suspend.sf.net>.
In principle it does not require ACPI or APM, although for example
- ACPI will be used if available.
+ ACPI will be used for the final steps when it is available. One
+ of the reasons to use software suspend is that the firmware hooks
+ for suspend states like suspend-to-RAM (STR) often don't work very
+ well with Linux.
It creates an image which is saved in your active swap. Upon the next
boot, pass the 'resume=/dev/swappartition' argument to the kernel to
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 623786d4415..89bcf4973ee 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -27,7 +27,15 @@ int pm_prepare_console(void)
return 1;
}
- set_console(SUSPEND_CONSOLE);
+ if (set_console(SUSPEND_CONSOLE)) {
+ /*
+ * We're unable to switch to the SUSPEND_CONSOLE.
+ * Let the calling function know so it can decide
+ * what to do.
+ */
+ release_console_sem();
+ return 1;
+ }
release_console_sem();
if (vt_waitactive(SUSPEND_CONSOLE)) {
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 406b20adb27..06331374d86 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -39,7 +39,13 @@ static inline int platform_prepare(void)
{
int error = 0;
- if (pm_disk_mode == PM_DISK_PLATFORM) {
+ switch (pm_disk_mode) {
+ case PM_DISK_TEST:
+ case PM_DISK_TESTPROC:
+ case PM_DISK_SHUTDOWN:
+ case PM_DISK_REBOOT:
+ break;
+ default:
if (pm_ops && pm_ops->prepare)
error = pm_ops->prepare(PM_SUSPEND_DISK);
}
@@ -48,40 +54,48 @@ static inline int platform_prepare(void)
/**
* power_down - Shut machine down for hibernate.
- * @mode: Suspend-to-disk mode
*
- * Use the platform driver, if configured so, and return gracefully if it
- * fails.
- * Otherwise, try to power off and reboot. If they fail, halt the machine,
- * there ain't no turning back.
+ * Use the platform driver, if configured so; otherwise try
+ * to power off or reboot.
*/
-static void power_down(suspend_disk_method_t mode)
+static void power_down(void)
{
- switch(mode) {
- case PM_DISK_PLATFORM:
- if (pm_ops && pm_ops->enter) {
- kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
- pm_ops->enter(PM_SUSPEND_DISK);
- break;
- }
+ switch (pm_disk_mode) {
+ case PM_DISK_TEST:
+ case PM_DISK_TESTPROC:
+ break;
case PM_DISK_SHUTDOWN:
kernel_power_off();
break;
case PM_DISK_REBOOT:
kernel_restart(NULL);
break;
+ default:
+ if (pm_ops && pm_ops->enter) {
+ kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
+ pm_ops->enter(PM_SUSPEND_DISK);
+ break;
+ }
}
kernel_halt();
- /* Valid image is on the disk, if we continue we risk serious data corruption
- after resume. */
+ /*
+ * Valid image is on the disk, if we continue we risk serious data
+ * corruption after resume.
+ */
printk(KERN_CRIT "Please power me down manually\n");
while(1);
}
static inline void platform_finish(void)
{
- if (pm_disk_mode == PM_DISK_PLATFORM) {
+ switch (pm_disk_mode) {
+ case PM_DISK_TEST:
+ case PM_DISK_TESTPROC:
+ case PM_DISK_SHUTDOWN:
+ case PM_DISK_REBOOT:
+ break;
+ default:
if (pm_ops && pm_ops->finish)
pm_ops->finish(PM_SUSPEND_DISK);
}
@@ -108,8 +122,6 @@ static int prepare_processes(void)
/**
* pm_suspend_disk - The granpappy of hibernation power management.
*
- * If we're going through the firmware, then get it over with quickly.
- *
* If not, then call swsusp to do its thing, then figure out how
* to power down the system.
*/
@@ -118,15 +130,25 @@ int pm_suspend_disk(void)
{
int error;
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0))
+ return -EBUSY;
+
+ /* Allocate memory management structures */
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Exit;
+
error = prepare_processes();
if (error)
- return error;
+ goto Finish;
if (pm_disk_mode == PM_DISK_TESTPROC) {
printk("swsusp debug: Waiting for 5 seconds.\n");
mdelay(5000);
goto Thaw;
}
+
/* Free memory before shutting down devices. */
error = swsusp_shrink_memory();
if (error)
@@ -166,7 +188,7 @@ int pm_suspend_disk(void)
pr_debug("PM: writing image.\n");
error = swsusp_write();
if (!error)
- power_down(pm_disk_mode);
+ power_down();
else {
swsusp_free();
goto Thaw;
@@ -184,6 +206,10 @@ int pm_suspend_disk(void)
resume_console();
Thaw:
unprepare_processes();
+ Finish:
+ free_basic_memory_bitmaps();
+ Exit:
+ atomic_inc(&snapshot_device_available);
return error;
}
@@ -227,25 +253,27 @@ static int software_resume(void)
}
pr_debug("PM: Checking swsusp image.\n");
-
error = swsusp_check();
if (error)
- goto Done;
+ goto Unlock;
- pr_debug("PM: Preparing processes for restore.\n");
+ /* The snapshot device should not be opened while we're running */
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+ error = -EBUSY;
+ goto Unlock;
+ }
+ error = create_basic_memory_bitmaps();
+ if (error)
+ goto Finish;
+
+ pr_debug("PM: Preparing processes for restore.\n");
error = prepare_processes();
if (error) {
swsusp_close();
goto Done;
}
- error = platform_prepare();
- if (error) {
- swsusp_free();
- goto Thaw;
- }
-
pr_debug("PM: Reading swsusp image.\n");
error = swsusp_read();
@@ -268,14 +296,17 @@ static int software_resume(void)
enable_nonboot_cpus();
Free:
swsusp_free();
- platform_finish();
device_resume();
resume_console();
Thaw:
printk(KERN_ERR "PM: Restore failed, recovering.\n");
unprepare_processes();
Done:
+ free_basic_memory_bitmaps();
+ Finish:
+ atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
+ Unlock:
mutex_unlock(&pm_mutex);
pr_debug("PM: Resume from disk failed.\n");
return 0;
@@ -285,7 +316,6 @@ late_initcall(software_resume);
static const char * const pm_disk_modes[] = {
- [PM_DISK_FIRMWARE] = "firmware",
[PM_DISK_PLATFORM] = "platform",
[PM_DISK_SHUTDOWN] = "shutdown",
[PM_DISK_REBOOT] = "reboot",
@@ -296,37 +326,62 @@ static const char * const pm_disk_modes[] = {
/**
* disk - Control suspend-to-disk mode
*
- * Suspend-to-disk can be handled in several ways. The greatest
- * distinction is who writes memory to disk - the firmware or the OS.
- * If the firmware does it, we assume that it also handles suspending
- * the system.
- * If the OS does it, then we have three options for putting the system
- * to sleep - using the platform driver (e.g. ACPI or other PM registers),
- * powering off the system or rebooting the system (for testing).
+ * Suspend-to-disk can be handled in several ways. We have a few options
+ * for putting the system to sleep - using the platform driver (e.g. ACPI
+ * or other pm_ops), powering off the system or rebooting the system
+ * (for testing) as well as the two test modes.
*
- * The system will support either 'firmware' or 'platform', and that is
- * known a priori (and encoded in pm_ops). But, the user may choose
- * 'shutdown' or 'reboot' as alternatives.
+ * The system can support 'platform', and that is known a priori (and
+ * encoded in pm_ops). However, the user may choose 'shutdown' or 'reboot'
+ * as alternatives, as well as the test modes 'test' and 'testproc'.
*
* show() will display what the mode is currently set to.
* store() will accept one of
*
- * 'firmware'
* 'platform'
* 'shutdown'
* 'reboot'
+ * 'test'
+ * 'testproc'
*
- * It will only change to 'firmware' or 'platform' if the system
+ * It will only change to 'platform' if the system
* supports it (as determined from pm_ops->pm_disk_mode).
*/
-static ssize_t disk_show(struct subsystem * subsys, char * buf)
+static ssize_t disk_show(struct kset *kset, char *buf)
{
- return sprintf(buf, "%s\n", pm_disk_modes[pm_disk_mode]);
+ int i;
+ char *start = buf;
+
+ for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
+ if (!pm_disk_modes[i])
+ continue;
+ switch (i) {
+ case PM_DISK_SHUTDOWN:
+ case PM_DISK_REBOOT:
+ case PM_DISK_TEST:
+ case PM_DISK_TESTPROC:
+ break;
+ default:
+ if (pm_ops && pm_ops->enter &&
+ (i == pm_ops->pm_disk_mode))
+ break;
+ /* not a valid mode, continue with loop */
+ continue;
+ }
+ if (i == pm_disk_mode)
+ buf += sprintf(buf, "[%s]", pm_disk_modes[i]);
+ else
+ buf += sprintf(buf, "%s", pm_disk_modes[i]);
+ if (i+1 != PM_DISK_MAX)
+ buf += sprintf(buf, " ");
+ }
+ buf += sprintf(buf, "\n");
+ return buf-start;
}
-static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
+static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
{
int error = 0;
int i;
@@ -338,17 +393,21 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
- for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
+ for (i = PM_DISK_PLATFORM; i < PM_DISK_MAX; i++) {
if (!strncmp(buf, pm_disk_modes[i], len)) {
mode = i;
break;
}
}
if (mode) {
- if (mode == PM_DISK_SHUTDOWN || mode == PM_DISK_REBOOT ||
- mode == PM_DISK_TEST || mode == PM_DISK_TESTPROC) {
+ switch (mode) {
+ case PM_DISK_SHUTDOWN:
+ case PM_DISK_REBOOT:
+ case PM_DISK_TEST:
+ case PM_DISK_TESTPROC:
pm_disk_mode = mode;
- } else {
+ break;
+ default:
if (pm_ops && pm_ops->enter &&
(mode == pm_ops->pm_disk_mode))
pm_disk_mode = mode;
@@ -367,13 +426,13 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
power_attr(disk);
-static ssize_t resume_show(struct subsystem * subsys, char *buf)
+static ssize_t resume_show(struct kset *kset, char *buf)
{
return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
MINOR(swsusp_resume_device));
}
-static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
+static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
{
unsigned int maj, min;
dev_t res;
@@ -399,12 +458,12 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n)
power_attr(resume);
-static ssize_t image_size_show(struct subsystem * subsys, char *buf)
+static ssize_t image_size_show(struct kset *kset, char *buf)
{
return sprintf(buf, "%lu\n", image_size);
}
-static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n)
{
unsigned long size;
@@ -433,7 +492,7 @@ static struct attribute_group attr_group = {
static int __init pm_disk_init(void)
{
- return sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+ return sysfs_create_group(&power_subsys.kobj, &attr_group);
}
core_initcall(pm_disk_init);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a064dfd8877..f6dda685e7e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -30,7 +30,7 @@
DEFINE_MUTEX(pm_mutex);
struct pm_ops *pm_ops;
-suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM;
+suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
/**
* pm_set_ops - Set the global power method table.
@@ -41,9 +41,26 @@ void pm_set_ops(struct pm_ops * ops)
{
mutex_lock(&pm_mutex);
pm_ops = ops;
+ if (ops && ops->pm_disk_mode != PM_DISK_INVALID) {
+ pm_disk_mode = ops->pm_disk_mode;
+ } else
+ pm_disk_mode = PM_DISK_SHUTDOWN;
mutex_unlock(&pm_mutex);
}
+/**
+ * pm_valid_only_mem - generic memory-only valid callback
+ *
+ * pm_ops drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int pm_valid_only_mem(suspend_state_t state)
+{
+ return state == PM_SUSPEND_MEM;
+}
+
+
static inline void pm_finish(suspend_state_t state)
{
if (pm_ops->finish)
@@ -111,13 +128,24 @@ static int suspend_prepare(suspend_state_t state)
return error;
}
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+ local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+ local_irq_enable();
+}
int suspend_enter(suspend_state_t state)
{
int error = 0;
- unsigned long flags;
- local_irq_save(flags);
+ arch_suspend_disable_irqs();
+ BUG_ON(!irqs_disabled());
if ((error = device_power_down(PMSG_SUSPEND))) {
printk(KERN_ERR "Some devices failed to power down\n");
@@ -126,7 +154,8 @@ int suspend_enter(suspend_state_t state)
error = pm_ops->enter(state);
device_power_up();
Done:
- local_irq_restore(flags);
+ arch_suspend_enable_irqs();
+ BUG_ON(irqs_disabled());
return error;
}
@@ -155,22 +184,26 @@ static void suspend_finish(suspend_state_t state)
static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
-#ifdef CONFIG_SOFTWARE_SUSPEND
[PM_SUSPEND_DISK] = "disk",
-#endif
};
static inline int valid_state(suspend_state_t state)
{
/* Suspend-to-disk does not really need low-level support.
- * It can work with reboot if needed. */
+ * It can work with shutdown/reboot if needed. If it isn't
+ * configured, then it cannot be supported.
+ */
if (state == PM_SUSPEND_DISK)
+#ifdef CONFIG_SOFTWARE_SUSPEND
return 1;
+#else
+ return 0;
+#endif
/* all other states need lowlevel support and need to be
* valid to the lowlevel implementation, no valid callback
- * implies that all are valid. */
- if (!pm_ops || (pm_ops->valid && !pm_ops->valid(state)))
+ * implies that none are valid. */
+ if (!pm_ops || !pm_ops->valid || !pm_ops->valid(state))
return 0;
return 1;
}
@@ -215,15 +248,6 @@ static int enter_state(suspend_state_t state)
return error;
}
-/*
- * This is main interface to the outside world. It needs to be
- * called from process context.
- */
-int software_suspend(void)
-{
- return enter_state(PM_SUSPEND_DISK);
-}
-
/**
* pm_suspend - Externally visible function for suspending system.
@@ -256,7 +280,7 @@ decl_subsys(power,NULL,NULL);
* proper enumerated value, and initiates a suspend transition.
*/
-static ssize_t state_show(struct subsystem * subsys, char * buf)
+static ssize_t state_show(struct kset *kset, char *buf)
{
int i;
char * s = buf;
@@ -269,7 +293,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
return (s - buf);
}
-static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
+static ssize_t state_store(struct kset *kset, const char *buf, size_t n)
{
suspend_state_t state = PM_SUSPEND_STANDBY;
const char * const *s;
@@ -296,13 +320,13 @@ power_attr(state);
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
-static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
+static ssize_t pm_trace_show(struct kset *kset, char *buf)
{
return sprintf(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
-pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
+pm_trace_store(struct kset *kset, const char *buf, size_t n)
{
int val;
@@ -336,7 +360,7 @@ static int __init pm_init(void)
{
int error = subsystem_register(&power_subsys);
if (!error)
- error = sysfs_create_group(&power_subsys.kset.kobj,&attr_group);
+ error = sysfs_create_group(&power_subsys.kobj,&attr_group);
return error;
}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index eb461b816bf..34b43542785 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,8 +14,18 @@ struct swsusp_info {
#ifdef CONFIG_SOFTWARE_SUSPEND
-extern int pm_suspend_disk(void);
+/*
+ * Keep some memory free so that I/O operations can succeed without paging
+ * [Might this be more than 4 MB?]
+ */
+#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT)
+/*
+ * Keep 1 MB of memory free so that device drivers can allocate some pages in
+ * their .suspend() routines without breaking the suspend to disk.
+ */
+#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT)
+extern int pm_suspend_disk(void);
#else
static inline int pm_suspend_disk(void)
{
@@ -23,6 +33,8 @@ static inline int pm_suspend_disk(void)
}
#endif
+extern int pfn_is_nosave(unsigned long);
+
extern struct mutex pm_mutex;
#define power_attr(_name) \
@@ -35,10 +47,7 @@ static struct subsys_attribute _name##_attr = { \
.store = _name##_store, \
}
-extern struct subsystem power_subsys;
-
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
+extern struct kset power_subsys;
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
@@ -49,6 +58,8 @@ extern sector_t swsusp_resume_block;
extern asmlinkage int swsusp_arch_suspend(void);
extern asmlinkage int swsusp_arch_resume(void);
+extern int create_basic_memory_bitmaps(void);
+extern void free_basic_memory_bitmaps(void);
extern unsigned int count_data_pages(void);
/**
@@ -139,30 +150,12 @@ struct resume_swap_area {
#define PMOPS_ENTER 2
#define PMOPS_FINISH 3
-/**
- * The bitmap is used for tracing allocated swap pages
- *
- * The entire bitmap consists of a number of bitmap_page
- * structures linked with the help of the .next member.
- * Thus each page can be allocated individually, so we only
- * need to make 0-order memory allocations to create
- * the bitmap.
- */
-
-#define BITMAP_PAGE_SIZE (PAGE_SIZE - sizeof(void *))
-#define BITMAP_PAGE_CHUNKS (BITMAP_PAGE_SIZE / sizeof(long))
-#define BITS_PER_CHUNK (sizeof(long) * 8)
-#define BITMAP_PAGE_BITS (BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
-
-struct bitmap_page {
- unsigned long chunks[BITMAP_PAGE_CHUNKS];
- struct bitmap_page *next;
-};
+/* If unset, the snapshot device cannot be open. */
+extern atomic_t snapshot_device_available;
-extern void free_bitmap(struct bitmap_page *bitmap);
-extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
-extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap);
-extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
+extern sector_t alloc_swapdev_block(int swap);
+extern void free_all_swap_pages(int swap);
+extern int swsusp_swap_in_use(void);
extern int swsusp_check(void);
extern int swsusp_shrink_memory(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6d566bf7085..08841938738 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -8,7 +8,6 @@
#undef DEBUG
-#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
@@ -25,10 +24,9 @@
static inline int freezeable(struct task_struct * p)
{
- if ((p == current) ||
+ if ((p == current) ||
(p->flags & PF_NOFREEZE) ||
- (p->exit_state == EXIT_ZOMBIE) ||
- (p->exit_state == EXIT_DEAD))
+ (p->exit_state != 0))
return 0;
return 1;
}
@@ -47,8 +45,10 @@ void refrigerator(void)
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
- while (frozen(current)) {
- current->state = TASK_UNINTERRUPTIBLE;
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!frozen(current))
+ break;
schedule();
}
pr_debug("%s left refrigerator\n", current->comm);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fc53ad06812..b7039772b05 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -14,13 +14,13 @@
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/suspend.h>
-#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/pm.h>
#include <linux/device.h>
+#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/console.h>
@@ -34,6 +34,10 @@
#include "power.h"
+static int swsusp_page_is_free(struct page *);
+static void swsusp_set_page_forbidden(struct page *);
+static void swsusp_unset_page_forbidden(struct page *);
+
/* List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
* allocated by the "resume" kernel, so their contents cannot be written
@@ -67,15 +71,15 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed)
res = (void *)get_zeroed_page(gfp_mask);
if (safe_needed)
- while (res && PageNosaveFree(virt_to_page(res))) {
+ while (res && swsusp_page_is_free(virt_to_page(res))) {
/* The page is unsafe, mark it for swsusp_free() */
- SetPageNosave(virt_to_page(res));
+ swsusp_set_page_forbidden(virt_to_page(res));
allocated_unsafe_pages++;
res = (void *)get_zeroed_page(gfp_mask);
}
if (res) {
- SetPageNosave(virt_to_page(res));
- SetPageNosaveFree(virt_to_page(res));
+ swsusp_set_page_forbidden(virt_to_page(res));
+ swsusp_set_page_free(virt_to_page(res));
}
return res;
}
@@ -91,8 +95,8 @@ static struct page *alloc_image_page(gfp_t gfp_mask)
page = alloc_page(gfp_mask);
if (page) {
- SetPageNosave(page);
- SetPageNosaveFree(page);
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
}
return page;
}
@@ -110,9 +114,9 @@ static inline void free_image_page(void *addr, int clear_nosave_free)
page = virt_to_page(addr);
- ClearPageNosave(page);
+ swsusp_unset_page_forbidden(page);
if (clear_nosave_free)
- ClearPageNosaveFree(page);
+ swsusp_unset_page_free(page);
__free_page(page);
}
@@ -224,11 +228,6 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
* of type unsigned long each). It also contains the pfns that
* correspond to the start and end of the represented memory area and
* the number of bit chunks in the block.
- *
- * NOTE: Memory bitmaps are used for two types of operations only:
- * "set a bit" and "find the next bit set". Moreover, the searching
- * is always carried out after all of the "set a bit" operations
- * on given bitmap.
*/
#define BM_END_OF_MAP (~0UL)
@@ -443,15 +442,13 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
}
/**
- * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
+ * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
* to given pfn. The cur_zone_bm member of @bm and the cur_block member
* of @bm->cur_zone_bm are updated.
- *
- * If the bit cannot be set, the function returns -EINVAL .
*/
-static int
-memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+ void **addr, unsigned int *bit_nr)
{
struct zone_bitmap *zone_bm;
struct bm_block *bb;
@@ -463,8 +460,8 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
/* We don't assume that the zones are sorted by pfns */
while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
zone_bm = zone_bm->next;
- if (unlikely(!zone_bm))
- return -EINVAL;
+
+ BUG_ON(!zone_bm);
}
bm->cur.zone_bm = zone_bm;
}
@@ -475,13 +472,40 @@ memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
while (pfn >= bb->end_pfn) {
bb = bb->next;
- if (unlikely(!bb))
- return -EINVAL;
+
+ BUG_ON(!bb);
}
zone_bm->cur_block = bb;
pfn -= bb->start_pfn;
- set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
- return 0;
+ *bit_nr = pfn % BM_BITS_PER_CHUNK;
+ *addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+}
+
+static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ memory_bm_find_bit(bm, pfn, &addr, &bit);
+ set_bit(bit, addr);
+}
+
+static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ memory_bm_find_bit(bm, pfn, &addr, &bit);
+ clear_bit(bit, addr);
+}
+
+static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
+{
+ void *addr;
+ unsigned int bit;
+
+ memory_bm_find_bit(bm, pfn, &addr, &bit);
+ return test_bit(bit, addr);
}
/* Two auxiliary functions for memory_bm_next_pfn */
@@ -564,6 +588,199 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
}
/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+static LIST_HEAD(nosave_regions);
+
+/**
+ * register_nosave_region - register a range of page frames the contents
+ * of which should not be saved during the suspend (to be used in the early
+ * initialization code)
+ */
+
+void __init
+register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct nosave_region *region;
+
+ if (start_pfn >= end_pfn)
+ return;
+
+ if (!list_empty(&nosave_regions)) {
+ /* Try to extend the previous region (they should be sorted) */
+ region = list_entry(nosave_regions.prev,
+ struct nosave_region, list);
+ if (region->end_pfn == start_pfn) {
+ region->end_pfn = end_pfn;
+ goto Report;
+ }
+ }
+ /* This allocation cannot fail */
+ region = alloc_bootmem_low(sizeof(struct nosave_region));
+ region->start_pfn = start_pfn;
+ region->end_pfn = end_pfn;
+ list_add_tail(&region->list, &nosave_regions);
+ Report:
+ printk("swsusp: Registered nosave memory region: %016lx - %016lx\n",
+ start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
+}
+
+/*
+ * Set bits in this map correspond to the page frames the contents of which
+ * should not be saved during the suspend.
+ */
+static struct memory_bitmap *forbidden_pages_map;
+
+/* Set bits in this map correspond to free page frames. */
+static struct memory_bitmap *free_pages_map;
+
+/*
+ * Each page frame allocated for creating the image is marked by setting the
+ * corresponding bits in forbidden_pages_map and free_pages_map simultaneously
+ */
+
+void swsusp_set_page_free(struct page *page)
+{
+ if (free_pages_map)
+ memory_bm_set_bit(free_pages_map, page_to_pfn(page));
+}
+
+static int swsusp_page_is_free(struct page *page)
+{
+ return free_pages_map ?
+ memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0;
+}
+
+void swsusp_unset_page_free(struct page *page)
+{
+ if (free_pages_map)
+ memory_bm_clear_bit(free_pages_map, page_to_pfn(page));
+}
+
+static void swsusp_set_page_forbidden(struct page *page)
+{
+ if (forbidden_pages_map)
+ memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page));
+}
+
+int swsusp_page_is_forbidden(struct page *page)
+{
+ return forbidden_pages_map ?
+ memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0;
+}
+
+static void swsusp_unset_page_forbidden(struct page *page)
+{
+ if (forbidden_pages_map)
+ memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page));
+}
+
+/**
+ * mark_nosave_pages - set bits corresponding to the page frames the
+ * contents of which should not be saved in a given bitmap.
+ */
+
+static void mark_nosave_pages(struct memory_bitmap *bm)
+{
+ struct nosave_region *region;
+
+ if (list_empty(&nosave_regions))
+ return;
+
+ list_for_each_entry(region, &nosave_regions, list) {
+ unsigned long pfn;
+
+ printk("swsusp: Marking nosave pages: %016lx - %016lx\n",
+ region->start_pfn << PAGE_SHIFT,
+ region->end_pfn << PAGE_SHIFT);
+
+ for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
+ memory_bm_set_bit(bm, pfn);
+ }
+}
+
+/**
+ * create_basic_memory_bitmaps - create bitmaps needed for marking page
+ * frames that should not be saved and free page frames. The pointers
+ * forbidden_pages_map and free_pages_map are only modified if everything
+ * goes well, because we don't want the bits to be used before both bitmaps
+ * are set up.
+ */
+
+int create_basic_memory_bitmaps(void)
+{
+ struct memory_bitmap *bm1, *bm2;
+ int error = 0;
+
+ BUG_ON(forbidden_pages_map || free_pages_map);
+
+ bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm1)
+ return -ENOMEM;
+
+ error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY);
+ if (error)
+ goto Free_first_object;
+
+ bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL);
+ if (!bm2)
+ goto Free_first_bitmap;
+
+ error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY);
+ if (error)
+ goto Free_second_object;
+
+ forbidden_pages_map = bm1;
+ free_pages_map = bm2;
+ mark_nosave_pages(forbidden_pages_map);
+
+ printk("swsusp: Basic memory bitmaps created\n");
+
+ return 0;
+
+ Free_second_object:
+ kfree(bm2);
+ Free_first_bitmap:
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ Free_first_object:
+ kfree(bm1);
+ return -ENOMEM;
+}
+
+/**
+ * free_basic_memory_bitmaps - free memory bitmaps allocated by
+ * create_basic_memory_bitmaps(). The auxiliary pointers are necessary
+ * so that the bitmaps themselves are not referred to while they are being
+ * freed.
+ */
+
+void free_basic_memory_bitmaps(void)
+{
+ struct memory_bitmap *bm1, *bm2;
+
+ BUG_ON(!(forbidden_pages_map && free_pages_map));
+
+ bm1 = forbidden_pages_map;
+ bm2 = free_pages_map;
+ forbidden_pages_map = NULL;
+ free_pages_map = NULL;
+ memory_bm_free(bm1, PG_UNSAFE_CLEAR);
+ kfree(bm1);
+ memory_bm_free(bm2, PG_UNSAFE_CLEAR);
+ kfree(bm2);
+
+ printk("swsusp: Basic memory bitmaps freed\n");
+}
+
+/**
* snapshot_additional_pages - estimate the number of additional pages
* be needed for setting up the suspend image data structures for given
* zone (usually the returned value is greater than the exact number)
@@ -615,7 +832,8 @@ static struct page *saveable_highmem_page(unsigned long pfn)
BUG_ON(!PageHighMem(page));
- if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) ||
+ PageReserved(page))
return NULL;
return page;
@@ -651,17 +869,6 @@ static inline unsigned int count_highmem_pages(void) { return 0; }
#endif /* CONFIG_HIGHMEM */
/**
- * pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-static inline int pfn_is_nosave(unsigned long pfn)
-{
- unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
- unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
- return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
-
-/**
* saveable - Determine whether a non-highmem page should be included in
* the suspend image.
*
@@ -681,7 +888,7 @@ static struct page *saveable_page(unsigned long pfn)
BUG_ON(PageHighMem(page));
- if (PageNosave(page) || PageNosaveFree(page))
+ if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page))
return NULL;
if (PageReserved(page) && pfn_is_nosave(pfn))
@@ -821,9 +1028,10 @@ void swsusp_free(void)
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
- if (PageNosave(page) && PageNosaveFree(page)) {
- ClearPageNosave(page);
- ClearPageNosaveFree(page);
+ if (swsusp_page_is_forbidden(page) &&
+ swsusp_page_is_free(page)) {
+ swsusp_unset_page_forbidden(page);
+ swsusp_unset_page_free(page);
__free_page(page);
}
}
@@ -1146,7 +1354,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn))
- ClearPageNosaveFree(pfn_to_page(pfn));
+ swsusp_unset_page_free(pfn_to_page(pfn));
}
/* Mark pages that correspond to the "original" pfns as "unsafe" */
@@ -1155,7 +1363,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
pfn = memory_bm_next_pfn(bm);
if (likely(pfn != BM_END_OF_MAP)) {
if (likely(pfn_valid(pfn)))
- SetPageNosaveFree(pfn_to_page(pfn));
+ swsusp_set_page_free(pfn_to_page(pfn));
else
return -EFAULT;
}
@@ -1321,14 +1529,14 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
struct page *page;
page = alloc_page(__GFP_HIGHMEM);
- if (!PageNosaveFree(page)) {
+ if (!swsusp_page_is_free(page)) {
/* The page is "safe", set its bit the bitmap */
memory_bm_set_bit(bm, page_to_pfn(page));
safe_highmem_pages++;
}
/* Mark the page as allocated */
- SetPageNosave(page);
- SetPageNosaveFree(page);
+ swsusp_set_page_forbidden(page);
+ swsusp_set_page_free(page);
}
memory_bm_position_reset(bm);
safe_highmem_bm = bm;
@@ -1360,7 +1568,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
struct highmem_pbe *pbe;
void *kaddr;
- if (PageNosave(page) && PageNosaveFree(page)) {
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
/* We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
@@ -1522,14 +1730,14 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
error = -ENOMEM;
goto Free;
}
- if (!PageNosaveFree(virt_to_page(lp))) {
+ if (!swsusp_page_is_free(virt_to_page(lp))) {
/* The page is "safe", add it to the list */
lp->next = safe_pages_list;
safe_pages_list = lp;
}
/* Mark the page as allocated */
- SetPageNosave(virt_to_page(lp));
- SetPageNosaveFree(virt_to_page(lp));
+ swsusp_set_page_forbidden(virt_to_page(lp));
+ swsusp_set_page_free(virt_to_page(lp));
nr_pages--;
}
/* Free the reserved safe pages so that chain_alloc() can use them */
@@ -1558,7 +1766,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
if (PageHighMem(page))
return get_highmem_page_buffer(page, ca);
- if (PageNosave(page) && PageNosaveFree(page))
+ if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
/* We have allocated the "original" page frame and we can
* use it directly to store the loaded page.
*/
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3581f8f86ac..b8b235cc19d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,7 +12,6 @@
*/
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/utsname.h>
#include <linux/version.h>
@@ -33,12 +32,14 @@ extern char resume_file[];
#define SWSUSP_SIG "S1SUSPEND"
-static struct swsusp_header {
+struct swsusp_header {
char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
sector_t image;
char orig_sig[10];
char sig[10];
-} __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
+} __attribute__((packed));
+
+static struct swsusp_header *swsusp_header;
/*
* General things
@@ -141,14 +142,14 @@ static int mark_swapfiles(sector_t start)
{
int error;
- bio_read_page(swsusp_resume_block, &swsusp_header, NULL);
- if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
- !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
- memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
- memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
- swsusp_header.image = start;
+ bio_read_page(swsusp_resume_block, swsusp_header, NULL);
+ if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
+ !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
+ memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
+ swsusp_header->image = start;
error = bio_write_page(swsusp_resume_block,
- &swsusp_header, NULL);
+ swsusp_header, NULL);
} else {
printk(KERN_ERR "swsusp: Swap header not found!\n");
error = -ENODEV;
@@ -241,7 +242,6 @@ struct swap_map_page {
struct swap_map_handle {
struct swap_map_page *cur;
sector_t cur_swap;
- struct bitmap_page *bitmap;
unsigned int k;
};
@@ -250,9 +250,6 @@ static void release_swap_writer(struct swap_map_handle *handle)
if (handle->cur)
free_page((unsigned long)handle->cur);
handle->cur = NULL;
- if (handle->bitmap)
- free_bitmap(handle->bitmap);
- handle->bitmap = NULL;
}
static int get_swap_writer(struct swap_map_handle *handle)
@@ -260,12 +257,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
if (!handle->cur)
return -ENOMEM;
- handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
- if (!handle->bitmap) {
- release_swap_writer(handle);
- return -ENOMEM;
- }
- handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap);
+ handle->cur_swap = alloc_swapdev_block(root_swap);
if (!handle->cur_swap) {
release_swap_writer(handle);
return -ENOSPC;
@@ -282,7 +274,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
if (!handle->cur)
return -EINVAL;
- offset = alloc_swapdev_block(root_swap, handle->bitmap);
+ offset = alloc_swapdev_block(root_swap);
error = write_page(buf, offset, bio_chain);
if (error)
return error;
@@ -291,7 +283,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
error = wait_on_bio_chain(bio_chain);
if (error)
goto out;
- offset = alloc_swapdev_block(root_swap, handle->bitmap);
+ offset = alloc_swapdev_block(root_swap);
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
@@ -428,7 +420,8 @@ int swsusp_write(void)
}
}
if (error)
- free_all_swap_pages(root_swap, handle.bitmap);
+ free_all_swap_pages(root_swap);
+
release_swap_writer(&handle);
out:
swsusp_close();
@@ -564,7 +557,7 @@ int swsusp_read(void)
if (error < PAGE_SIZE)
return error < 0 ? error : -EFAULT;
header = (struct swsusp_info *)data_of(snapshot);
- error = get_swap_reader(&handle, swsusp_header.image);
+ error = get_swap_reader(&handle, swsusp_header->image);
if (!error)
error = swap_read_page(&handle, header, NULL);
if (!error)
@@ -591,17 +584,17 @@ int swsusp_check(void)
resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
if (!IS_ERR(resume_bdev)) {
set_blocksize(resume_bdev, PAGE_SIZE);
- memset(&swsusp_header, 0, sizeof(swsusp_header));
+ memset(swsusp_header, 0, sizeof(PAGE_SIZE));
error = bio_read_page(swsusp_resume_block,
- &swsusp_header, NULL);
+ swsusp_header, NULL);
if (error)
return error;
- if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
- memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
+ if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
+ memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
/* Reset swap signature now */
error = bio_write_page(swsusp_resume_block,
- &swsusp_header, NULL);
+ swsusp_header, NULL);
} else {
return -EINVAL;
}
@@ -632,3 +625,13 @@ void swsusp_close(void)
blkdev_put(resume_bdev);
}
+
+static int swsusp_header_init(void)
+{
+ swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL);
+ if (!swsusp_header)
+ panic("Could not allocate memory for swsusp_header\n");
+ return 0;
+}
+
+core_initcall(swsusp_header_init);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 7fb834397a0..5da304c8f1f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -50,6 +50,7 @@
#include <linux/syscalls.h>
#include <linux/highmem.h>
#include <linux/time.h>
+#include <linux/rbtree.h>
#include "power.h"
@@ -74,72 +75,69 @@ static inline unsigned int count_highmem_pages(void) { return 0; }
/**
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
- *
- * The functions operate on a linked bitmap structure defined
- * in power.h
*/
-void free_bitmap(struct bitmap_page *bitmap)
-{
- struct bitmap_page *bp;
+struct swsusp_extent {
+ struct rb_node node;
+ unsigned long start;
+ unsigned long end;
+};
- while (bitmap) {
- bp = bitmap->next;
- free_page((unsigned long)bitmap);
- bitmap = bp;
- }
-}
+static struct rb_root swsusp_extents = RB_ROOT;
-struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
+static int swsusp_extents_insert(unsigned long swap_offset)
{
- struct bitmap_page *bitmap, *bp;
- unsigned int n;
-
- if (!nr_bits)
- return NULL;
-
- bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
- bp = bitmap;
- for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
- bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
- bp = bp->next;
- if (!bp) {
- free_bitmap(bitmap);
- return NULL;
+ struct rb_node **new = &(swsusp_extents.rb_node);
+ struct rb_node *parent = NULL;
+ struct swsusp_extent *ext;
+
+ /* Figure out where to put the new node */
+ while (*new) {
+ ext = container_of(*new, struct swsusp_extent, node);
+ parent = *new;
+ if (swap_offset < ext->start) {
+ /* Try to merge */
+ if (swap_offset == ext->start - 1) {
+ ext->start--;
+ return 0;
+ }
+ new = &((*new)->rb_left);
+ } else if (swap_offset > ext->end) {
+ /* Try to merge */
+ if (swap_offset == ext->end + 1) {
+ ext->end++;
+ return 0;
+ }
+ new = &((*new)->rb_right);
+ } else {
+ /* It already is in the tree */
+ return -EINVAL;
}
}
- return bitmap;
-}
-
-static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
-{
- unsigned int n;
-
- n = BITMAP_PAGE_BITS;
- while (bitmap && n <= bit) {
- n += BITMAP_PAGE_BITS;
- bitmap = bitmap->next;
- }
- if (!bitmap)
- return -EINVAL;
- n -= BITMAP_PAGE_BITS;
- bit -= n;
- n = 0;
- while (bit >= BITS_PER_CHUNK) {
- bit -= BITS_PER_CHUNK;
- n++;
- }
- bitmap->chunks[n] |= (1UL << bit);
+ /* Add the new node and rebalance the tree. */
+ ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
+ if (!ext)
+ return -ENOMEM;
+
+ ext->start = swap_offset;
+ ext->end = swap_offset;
+ rb_link_node(&ext->node, parent, new);
+ rb_insert_color(&ext->node, &swsusp_extents);
return 0;
}
-sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
+/**
+ * alloc_swapdev_block - allocate a swap page and register that it has
+ * been allocated, so that it can be freed in case of an error.
+ */
+
+sector_t alloc_swapdev_block(int swap)
{
unsigned long offset;
offset = swp_offset(get_swap_page_of_type(swap));
if (offset) {
- if (bitmap_set(bitmap, offset))
+ if (swsusp_extents_insert(offset))
swap_free(swp_entry(swap, offset));
else
return swapdev_block(swap, offset);
@@ -147,23 +145,34 @@ sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap)
return 0;
}
-void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
+/**
+ * free_all_swap_pages - free swap pages allocated for saving image data.
+ * It also frees the extents used to register which swap entres had been
+ * allocated.
+ */
+
+void free_all_swap_pages(int swap)
{
- unsigned int bit, n;
- unsigned long test;
-
- bit = 0;
- while (bitmap) {
- for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
- for (test = 1UL; test; test <<= 1) {
- if (bitmap->chunks[n] & test)
- swap_free(swp_entry(swap, bit));
- bit++;
- }
- bitmap = bitmap->next;
+ struct rb_node *node;
+
+ while ((node = swsusp_extents.rb_node)) {
+ struct swsusp_extent *ext;
+ unsigned long offset;
+
+ ext = container_of(node, struct swsusp_extent, node);
+ rb_erase(node, &swsusp_extents);
+ for (offset = ext->start; offset <= ext->end; offset++)
+ swap_free(swp_entry(swap, offset));
+
+ kfree(ext);
}
}
+int swsusp_swap_in_use(void)
+{
+ return (swsusp_extents.rb_node != NULL);
+}
+
/**
* swsusp_show_speed - print the time elapsed between two events represented by
* @start and @stop
@@ -224,18 +233,18 @@ int swsusp_shrink_memory(void)
long size, highmem_size;
highmem_size = count_highmem_pages();
- size = count_data_pages() + PAGES_FOR_IO;
+ size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
tmp = size;
size += highmem_size;
for_each_zone (zone)
if (populated_zone(zone)) {
+ tmp += snapshot_additional_pages(zone);
if (is_highmem(zone)) {
highmem_size -=
zone_page_state(zone, NR_FREE_PAGES);
} else {
tmp -= zone_page_state(zone, NR_FREE_PAGES);
tmp += zone->lowmem_reserve[ZONE_NORMAL];
- tmp += snapshot_additional_pages(zone);
}
}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index dd09efe7df5..040560d9c31 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -33,25 +33,29 @@
static struct snapshot_data {
struct snapshot_handle handle;
int swap;
- struct bitmap_page *bitmap;
int mode;
char frozen;
char ready;
char platform_suspend;
} snapshot_state;
-static atomic_t device_available = ATOMIC_INIT(1);
+atomic_t snapshot_device_available = ATOMIC_INIT(1);
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- if (!atomic_add_unless(&device_available, -1, 0))
+ if (!atomic_add_unless(&snapshot_device_available, -1, 0))
return -EBUSY;
- if ((filp->f_flags & O_ACCMODE) == O_RDWR)
+ if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
+ atomic_inc(&snapshot_device_available);
return -ENOSYS;
-
+ }
+ if(create_basic_memory_bitmaps()) {
+ atomic_inc(&snapshot_device_available);
+ return -ENOMEM;
+ }
nonseekable_open(inode, filp);
data = &snapshot_state;
filp->private_data = data;
@@ -64,7 +68,6 @@ static int snapshot_open(struct inode *inode, struct file *filp)
data->swap = -1;
data->mode = O_WRONLY;
}
- data->bitmap = NULL;
data->frozen = 0;
data->ready = 0;
data->platform_suspend = 0;
@@ -77,16 +80,15 @@ static int snapshot_release(struct inode *inode, struct file *filp)
struct snapshot_data *data;
swsusp_free();
+ free_basic_memory_bitmaps();
data = filp->private_data;
- free_all_swap_pages(data->swap, data->bitmap);
- free_bitmap(data->bitmap);
+ free_all_swap_pages(data->swap);
if (data->frozen) {
mutex_lock(&pm_mutex);
thaw_processes();
- enable_nonboot_cpus();
mutex_unlock(&pm_mutex);
}
- atomic_inc(&device_available);
+ atomic_inc(&snapshot_device_available);
return 0;
}
@@ -294,14 +296,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = -ENODEV;
break;
}
- if (!data->bitmap) {
- data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
- if (!data->bitmap) {
- error = -ENOMEM;
- break;
- }
- }
- offset = alloc_swapdev_block(data->swap, data->bitmap);
+ offset = alloc_swapdev_block(data->swap);
if (offset) {
offset <<= PAGE_SHIFT;
error = put_user(offset, (sector_t __user *)arg);
@@ -315,13 +310,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
error = -ENODEV;
break;
}
- free_all_swap_pages(data->swap, data->bitmap);
- free_bitmap(data->bitmap);
- data->bitmap = NULL;
+ free_all_swap_pages(data->swap);
break;
case SNAPSHOT_SET_SWAP_FILE:
- if (!data->bitmap) {
+ if (!swsusp_swap_in_use()) {
/*
* User space encodes device types as two-byte values,
* so we need to recode them
@@ -368,9 +361,12 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
if (error) {
printk(KERN_ERR "Failed to suspend some devices.\n");
} else {
- /* Enter S3, system is already frozen */
- suspend_enter(PM_SUSPEND_MEM);
-
+ error = disable_nonboot_cpus();
+ if (!error) {
+ /* Enter S3, system is already frozen */
+ suspend_enter(PM_SUSPEND_MEM);
+ enable_nonboot_cpus();
+ }
/* Wake up devices */
device_resume();
}
@@ -417,7 +413,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
break;
case SNAPSHOT_SET_SWAP_AREA:
- if (data->bitmap) {
+ if (swsusp_swap_in_use()) {
error = -EPERM;
} else {
struct resume_swap_area swap_area;
diff --git a/kernel/printk.c b/kernel/printk.c
index 4b47e59248d..0bbdeac2810 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -20,7 +20,6 @@
#include <linux/mm.h>
#include <linux/tty.h>
#include <linux/tty_driver.h>
-#include <linux/smp_lock.h>
#include <linux/console.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -931,8 +930,16 @@ void register_console(struct console *console)
{
int i;
unsigned long flags;
+ struct console *bootconsole = NULL;
- if (preferred_console < 0)
+ if (console_drivers) {
+ if (console->flags & CON_BOOT)
+ return;
+ if (console_drivers->flags & CON_BOOT)
+ bootconsole = console_drivers;
+ }
+
+ if (preferred_console < 0 || bootconsole || !console_drivers)
preferred_console = selected_console;
/*
@@ -978,8 +985,11 @@ void register_console(struct console *console)
if (!(console->flags & CON_ENABLED))
return;
- if (console_drivers && (console_drivers->flags & CON_BOOT)) {
- unregister_console(console_drivers);
+ if (bootconsole) {
+ printk(KERN_INFO "console handover: boot [%s%d] -> real [%s%d]\n",
+ bootconsole->name, bootconsole->index,
+ console->name, console->index);
+ unregister_console(bootconsole);
console->flags &= ~CON_PRINTBUFFER;
}
@@ -1030,16 +1040,11 @@ int unregister_console(struct console *console)
}
}
- /* If last console is removed, we re-enable picking the first
- * one that gets registered. Without that, pmac early boot console
- * would prevent fbcon from taking over.
- *
+ /*
* If this isn't the last console and it has CON_CONSDEV set, we
* need to set it on the next preferred console.
*/
- if (console_drivers == NULL)
- preferred_console = selected_console;
- else if (console->flags & CON_CONSDEV)
+ if (console_drivers != NULL && console->flags & CON_CONSDEV)
console_drivers->flags |= CON_CONSDEV;
release_console_sem();
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index bcd14e83ef3..55ba82a85a6 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -502,10 +502,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-static struct rcu_torture_ops *torture_ops[] =
- { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, &srcu_ops,
- &sched_ops, NULL };
-
/*
* RCU torture writer kthread. Repeatedly substitutes a new structure
* for that pointed to by rcu_torture_current, freeing the old structure
@@ -534,7 +530,7 @@ rcu_torture_writer(void *arg)
rp->rtort_mbtest = 1;
rcu_assign_pointer(rcu_torture_current, rp);
smp_wmb();
- if (old_rp != NULL) {
+ if (old_rp) {
i = old_rp->rtort_pipe_count;
if (i > RCU_TORTURE_PIPE_LEN)
i = RCU_TORTURE_PIPE_LEN;
@@ -685,7 +681,7 @@ rcu_torture_printk(char *page)
atomic_read(&rcu_torture_wcount[i]));
}
cnt += sprintf(&page[cnt], "\n");
- if (cur_ops->stats != NULL)
+ if (cur_ops->stats)
cnt += cur_ops->stats(&page[cnt]);
return cnt;
}
@@ -749,13 +745,13 @@ static void rcu_torture_shuffle_tasks(void)
set_cpus_allowed(current, tmp_mask);
- if (reader_tasks != NULL) {
+ if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
if (reader_tasks[i])
set_cpus_allowed(reader_tasks[i], tmp_mask);
}
- if (fakewriter_tasks != NULL) {
+ if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed(fakewriter_tasks[i], tmp_mask);
@@ -808,21 +804,21 @@ rcu_torture_cleanup(void)
int i;
fullstop = 1;
- if (shuffler_task != NULL) {
+ if (shuffler_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
kthread_stop(shuffler_task);
}
shuffler_task = NULL;
- if (writer_task != NULL) {
+ if (writer_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
kthread_stop(writer_task);
}
writer_task = NULL;
- if (reader_tasks != NULL) {
+ if (reader_tasks) {
for (i = 0; i < nrealreaders; i++) {
- if (reader_tasks[i] != NULL) {
+ if (reader_tasks[i]) {
VERBOSE_PRINTK_STRING(
"Stopping rcu_torture_reader task");
kthread_stop(reader_tasks[i]);
@@ -834,9 +830,9 @@ rcu_torture_cleanup(void)
}
rcu_torture_current = NULL;
- if (fakewriter_tasks != NULL) {
+ if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
- if (fakewriter_tasks[i] != NULL) {
+ if (fakewriter_tasks[i]) {
VERBOSE_PRINTK_STRING(
"Stopping rcu_torture_fakewriter task");
kthread_stop(fakewriter_tasks[i]);
@@ -847,7 +843,7 @@ rcu_torture_cleanup(void)
fakewriter_tasks = NULL;
}
- if (stats_task != NULL) {
+ if (stats_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
kthread_stop(stats_task);
}
@@ -858,7 +854,7 @@ rcu_torture_cleanup(void)
rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
- if (cur_ops->cleanup != NULL)
+ if (cur_ops->cleanup)
cur_ops->cleanup();
if (atomic_read(&n_rcu_torture_error))
rcu_torture_print_module_parms("End of test: FAILURE");
@@ -866,27 +862,28 @@ rcu_torture_cleanup(void)
rcu_torture_print_module_parms("End of test: SUCCESS");
}
-static int
+static int __init
rcu_torture_init(void)
{
int i;
int cpu;
int firsterr = 0;
+ static struct rcu_torture_ops *torture_ops[] =
+ { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
+ &srcu_ops, &sched_ops, };
/* Process args and tell the world that the torturer is on the job. */
-
- for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
cur_ops = torture_ops[i];
- if (strcmp(torture_type, cur_ops->name) == 0) {
+ if (strcmp(torture_type, cur_ops->name) == 0)
break;
- }
}
- if (cur_ops == NULL) {
+ if (i == ARRAY_SIZE(torture_ops)) {
printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
torture_type);
return (-EINVAL);
}
- if (cur_ops->init != NULL)
+ if (cur_ops->init)
cur_ops->init(); /* no "goto unwind" prior to this point!!! */
if (nreaders >= 0)
@@ -899,7 +896,7 @@ rcu_torture_init(void)
/* Set up the freelist. */
INIT_LIST_HEAD(&rcu_torture_freelist);
- for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+ for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) {
rcu_tortures[i].rtort_mbtest = 0;
list_add_tail(&rcu_tortures[i].rtort_free,
&rcu_torture_freelist);
diff --git a/kernel/resource.c b/kernel/resource.c
index bdb55a33f96..9bd14fd3e6d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -213,27 +213,6 @@ int request_resource(struct resource *root, struct resource *new)
EXPORT_SYMBOL(request_resource);
/**
- * ____request_resource - reserve a resource, with resource conflict returned
- * @root: root resource descriptor
- * @new: resource descriptor desired by caller
- *
- * Returns:
- * On success, NULL is returned.
- * On error, a pointer to the conflicting resource is returned.
- */
-struct resource *____request_resource(struct resource *root, struct resource *new)
-{
- struct resource *conflict;
-
- write_lock(&resource_lock);
- conflict = __request_resource(root, new);
- write_unlock(&resource_lock);
- return conflict;
-}
-
-EXPORT_SYMBOL(____request_resource);
-
-/**
* release_resource - release a previously reserved resource
* @old: resource pointer
*/
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 291ded556aa..9a87886b022 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -60,7 +60,7 @@ int down_write_trylock(struct rw_semaphore *sem)
int ret = __down_write_trylock(sem);
if (ret == 1)
- rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
return ret;
}
diff --git a/kernel/sched.c b/kernel/sched.c
index a4ca632c477..a3a04085e79 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -52,8 +52,9 @@
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <asm/tlb.h>
+#include <linux/reciprocal_div.h>
+#include <asm/tlb.h>
#include <asm/unistd.h>
/*
@@ -168,7 +169,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
+ (((p)->prio < (rq)->curr->prio) && ((p)->array == (rq)->active))
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
@@ -181,6 +182,27 @@ static unsigned int static_prio_timeslice(int static_prio)
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
+#ifdef CONFIG_SMP
+/*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+ return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+ sg->__cpu_power += val;
+ sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+#endif
+
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
@@ -223,6 +245,10 @@ struct rq {
unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
+ unsigned char idle_at_tick;
+#ifdef CONFIG_NO_HZ
+ unsigned char in_nohz_recently;
+#endif
#endif
unsigned long long nr_switches;
@@ -278,7 +304,7 @@ struct rq {
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues);
+static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static inline int cpu_of(struct rq *rq)
{
@@ -1049,6 +1075,17 @@ static void resched_task(struct task_struct *p)
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
+
+static void resched_cpu(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ if (!spin_trylock_irqsave(&rq->lock, flags))
+ return;
+ resched_task(cpu_curr(cpu));
+ spin_unlock_irqrestore(&rq->lock, flags);
+}
#else
static inline void resched_task(struct task_struct *p)
{
@@ -1241,7 +1278,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
@@ -1368,7 +1406,16 @@ static int wake_idle(int cpu, struct task_struct *p)
struct sched_domain *sd;
int i;
- if (idle_cpu(cpu))
+ /*
+ * If it is idle, then it is the best cpu to run this task.
+ *
+ * This cpu is also the best, if it has more than one task already.
+ * Siblings must be also busy(in most cases) as they didn't already
+ * pickup the extra load from this cpu and hence we need not check
+ * sibling runqueue info. This will avoid the checks and cache miss
+ * penalities associated with that.
+ */
+ if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
@@ -2352,12 +2399,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
}
total_load += avg_load;
- total_pwr += group->cpu_power;
+ total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = sg_div_cpu_power(group,
+ avg_load * SCHED_LOAD_SCALE);
- group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
+ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
@@ -2468,8 +2516,8 @@ group_next:
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * busiest->cpu_power,
- (avg_load - this_load) * this->cpu_power)
+ *imbalance = min(max_pull * busiest->__cpu_power,
+ (avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
@@ -2503,28 +2551,29 @@ small_imbalance:
* moving them.
*/
- pwr_now += busiest->cpu_power *
- min(busiest_load_per_task, max_load);
- pwr_now += this->cpu_power *
- min(this_load_per_task, this_load);
+ pwr_now += busiest->__cpu_power *
+ min(busiest_load_per_task, max_load);
+ pwr_now += this->__cpu_power *
+ min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- busiest->cpu_power;
+ tmp = sg_div_cpu_power(busiest,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
- pwr_move += busiest->cpu_power *
+ pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
- if (max_load * busiest->cpu_power <
+ if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = max_load * busiest->cpu_power / this->cpu_power;
+ tmp = sg_div_cpu_power(this,
+ max_load * busiest->__cpu_power);
else
- tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
- this->cpu_power;
- pwr_move += this->cpu_power *
- min(this_load_per_task, this_load + tmp);
+ tmp = sg_div_cpu_power(this,
+ busiest_load_per_task * SCHED_LOAD_SCALE);
+ pwr_move += this->__cpu_power *
+ min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
@@ -2657,6 +2706,12 @@ redo:
double_rq_unlock(this_rq, busiest);
local_irq_restore(flags);
+ /*
+ * some other cpu did the load balance for us.
+ */
+ if (nr_moved && this_cpu != smp_processor_id())
+ resched_cpu(this_cpu);
+
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
@@ -2927,32 +2982,98 @@ static void update_load(struct rq *this_rq)
}
}
+#ifdef CONFIG_NO_HZ
+static struct {
+ atomic_t load_balancer;
+ cpumask_t cpu_mask;
+} nohz ____cacheline_aligned = {
+ .load_balancer = ATOMIC_INIT(-1),
+ .cpu_mask = CPU_MASK_NONE,
+};
+
/*
- * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * This routine will try to nominate the ilb (idle load balancing)
+ * owner among the cpus whose ticks are stopped. ilb owner will do the idle
+ * load balancing on behalf of all those cpus. If all the cpus in the system
+ * go into this tickless mode, then there will be no ilb owner (as there is
+ * no need for one) and all the cpus will sleep till the next wakeup event
+ * arrives...
*
+ * For the ilb owner, tick is not stopped. And this tick will be used
+ * for idle load balancing. ilb owner will still be part of
+ * nohz.cpu_mask..
+ *
+ * While stopping the tick, this cpu will become the ilb owner if there
+ * is no other owner. And will be the owner till that cpu becomes busy
+ * or if all cpus in the system stop their ticks at which point
+ * there is no need for ilb owner.
+ *
+ * When the ilb owner becomes busy, it nominates another owner, during the
+ * next busy scheduler_tick()
+ */
+int select_nohz_load_balancer(int stop_tick)
+{
+ int cpu = smp_processor_id();
+
+ if (stop_tick) {
+ cpu_set(cpu, nohz.cpu_mask);
+ cpu_rq(cpu)->in_nohz_recently = 1;
+
+ /*
+ * If we are going offline and still the leader, give up!
+ */
+ if (cpu_is_offline(cpu) &&
+ atomic_read(&nohz.load_balancer) == cpu) {
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ return 0;
+ }
+
+ /* time for ilb owner also to sleep */
+ if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ atomic_set(&nohz.load_balancer, -1);
+ return 0;
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /* make me the ilb owner */
+ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
+ return 1;
+ } else if (atomic_read(&nohz.load_balancer) == cpu)
+ return 1;
+ } else {
+ if (!cpu_isset(cpu, nohz.cpu_mask))
+ return 0;
+
+ cpu_clear(cpu, nohz.cpu_mask);
+
+ if (atomic_read(&nohz.load_balancer) == cpu)
+ if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
+ BUG();
+ }
+ return 0;
+}
+#endif
+
+static DEFINE_SPINLOCK(balancing);
+
+/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in arch_init_sched_domains.
*/
-static DEFINE_SPINLOCK(balancing);
-
-static void run_rebalance_domains(struct softirq_action *h)
+static inline void rebalance_domains(int cpu, enum idle_type idle)
{
- int this_cpu = smp_processor_id(), balance = 1;
- struct rq *this_rq = cpu_rq(this_cpu);
+ int balance = 1;
+ struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
- /*
- * We are idle if there are no processes running. This
- * is valid even if we are the idle process (SMT).
- */
- enum idle_type idle = !this_rq->nr_running ?
- SCHED_IDLE : NOT_IDLE;
- /* Earliest time when we have to call run_rebalance_domains again */
+ /* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
- for_each_domain(this_cpu, sd) {
+ for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -2971,7 +3092,7 @@ static void run_rebalance_domains(struct softirq_action *h)
}
if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(this_cpu, this_rq, sd, idle, &balance)) {
+ if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -2995,7 +3116,114 @@ out:
if (!balance)
break;
}
- this_rq->next_balance = next_balance;
+ rq->next_balance = next_balance;
+}
+
+/*
+ * run_rebalance_domains is triggered when needed from the scheduler tick.
+ * In CONFIG_NO_HZ case, the idle load balance owner will do the
+ * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */
+static void run_rebalance_domains(struct softirq_action *h)
+{
+ int local_cpu = smp_processor_id();
+ struct rq *local_rq = cpu_rq(local_cpu);
+ enum idle_type idle = local_rq->idle_at_tick ? SCHED_IDLE : NOT_IDLE;
+
+ rebalance_domains(local_cpu, idle);
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * If this cpu is the owner for idle load balancing, then do the
+ * balancing on behalf of the other idle cpus whose ticks are
+ * stopped.
+ */
+ if (local_rq->idle_at_tick &&
+ atomic_read(&nohz.load_balancer) == local_cpu) {
+ cpumask_t cpus = nohz.cpu_mask;
+ struct rq *rq;
+ int balance_cpu;
+
+ cpu_clear(local_cpu, cpus);
+ for_each_cpu_mask(balance_cpu, cpus) {
+ /*
+ * If this cpu gets work to do, stop the load balancing
+ * work being done for other cpus. Next load
+ * balancing owner will pick it up.
+ */
+ if (need_resched())
+ break;
+
+ rebalance_domains(balance_cpu, SCHED_IDLE);
+
+ rq = cpu_rq(balance_cpu);
+ if (time_after(local_rq->next_balance, rq->next_balance))
+ local_rq->next_balance = rq->next_balance;
+ }
+ }
+#endif
+}
+
+/*
+ * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ *
+ * In case of CONFIG_NO_HZ, this is the place where we nominate a new
+ * idle load balancing owner or decide to stop the periodic load balancing,
+ * if the whole system is idle.
+ */
+static inline void trigger_load_balance(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_NO_HZ
+ /*
+ * If we were in the nohz mode recently and busy at the current
+ * scheduler tick, then check if we need to nominate new idle
+ * load balancer.
+ */
+ if (rq->in_nohz_recently && !rq->idle_at_tick) {
+ rq->in_nohz_recently = 0;
+
+ if (atomic_read(&nohz.load_balancer) == cpu) {
+ cpu_clear(cpu, nohz.cpu_mask);
+ atomic_set(&nohz.load_balancer, -1);
+ }
+
+ if (atomic_read(&nohz.load_balancer) == -1) {
+ /*
+ * simple selection for now: Nominate the
+ * first cpu in the nohz list to be the next
+ * ilb owner.
+ *
+ * TBD: Traverse the sched domains and nominate
+ * the nearest cpu in the nohz.cpu_mask.
+ */
+ int ilb = first_cpu(nohz.cpu_mask);
+
+ if (ilb != NR_CPUS)
+ resched_cpu(ilb);
+ }
+ }
+
+ /*
+ * If this cpu is idle and doing idle load balancing for all the
+ * cpus with ticks stopped, is it time for that to stop?
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
+ cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+ resched_cpu(cpu);
+ return;
+ }
+
+ /*
+ * If this cpu is idle and the idle load balancing is done by
+ * someone else, then no need raise the SCHED_SOFTIRQ
+ */
+ if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
+ cpu_isset(cpu, nohz.cpu_mask))
+ return;
+#endif
+ if (time_after_eq(jiffies, rq->next_balance))
+ raise_softirq(SCHED_SOFTIRQ);
}
#else
/*
@@ -3218,16 +3446,17 @@ void scheduler_tick(void)
unsigned long long now = sched_clock();
struct task_struct *p = current;
int cpu = smp_processor_id();
+ int idle_at_tick = idle_cpu(cpu);
struct rq *rq = cpu_rq(cpu);
update_cpu_clock(p, rq, now);
- if (p != rq->idle)
+ if (!idle_at_tick)
task_running_tick(rq, p);
#ifdef CONFIG_SMP
update_load(rq);
- if (time_after_eq(jiffies, rq->next_balance))
- raise_softirq(SCHED_SOFTIRQ);
+ rq->idle_at_tick = idle_at_tick;
+ trigger_load_balance(cpu);
#endif
}
@@ -3847,13 +4076,13 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
struct prio_array *array;
unsigned long flags;
struct rq *rq;
- int oldprio;
+ int delta;
BUG_ON(prio < 0 || prio > MAX_PRIO);
rq = task_rq_lock(p, &flags);
- oldprio = p->prio;
+ delta = prio - p->prio;
array = p->array;
if (array)
dequeue_task(p, array);
@@ -3869,13 +4098,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
enqueue_task(p, array);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (task_running(rq, p)) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
task_rq_unlock(rq, &flags);
@@ -3923,10 +4150,12 @@ void set_user_nice(struct task_struct *p, long nice)
enqueue_task(p, array);
inc_raw_weighted_load(rq, p);
/*
- * If the task increased its priority or is running and
- * lowered its priority, then reschedule its CPU:
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if our priority became higher
+ * than the current's.
*/
- if (delta < 0 || (delta > 0 && task_running(rq, p)))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (delta > 0 && task_running(rq, p)))
resched_task(rq->curr);
}
out_unlock:
@@ -4153,13 +4382,11 @@ recheck:
__activate_task(p, rq);
/*
* Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
+ * our priority decreased, or our priority became higher
+ * than the current's.
*/
- if (task_running(rq, p)) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else if (TASK_PREEMPTS_CURR(p, rq))
+ if (TASK_PREEMPTS_CURR(p, rq) ||
+ (task_running(rq, p) && p->prio > oldprio))
resched_task(rq->curr);
}
__task_rq_unlock(rq);
@@ -4687,32 +4914,10 @@ out_unlock:
return retval;
}
-static inline struct task_struct *eldest_child(struct task_struct *p)
-{
- if (list_empty(&p->children))
- return NULL;
- return list_entry(p->children.next,struct task_struct,sibling);
-}
-
-static inline struct task_struct *older_sibling(struct task_struct *p)
-{
- if (p->sibling.prev==&p->parent->children)
- return NULL;
- return list_entry(p->sibling.prev,struct task_struct,sibling);
-}
-
-static inline struct task_struct *younger_sibling(struct task_struct *p)
-{
- if (p->sibling.next==&p->parent->children)
- return NULL;
- return list_entry(p->sibling.next,struct task_struct,sibling);
-}
-
static const char stat_nam[] = "RSDTtZX";
static void show_task(struct task_struct *p)
{
- struct task_struct *relative;
unsigned long free = 0;
unsigned state;
@@ -4738,19 +4943,7 @@ static void show_task(struct task_struct *p)
free = (unsigned long)n - (unsigned long)end_of_stack(p);
}
#endif
- printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
- if ((relative = eldest_child(p)))
- printk("%5d ", relative->pid);
- else
- printk(" ");
- if ((relative = younger_sibling(p)))
- printk("%7d", relative->pid);
- else
- printk(" ");
- if ((relative = older_sibling(p)))
- printk(" %5d", relative->pid);
- else
- printk(" ");
+ printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
if (!p->mm)
printk(" (L-TLB)\n");
else
@@ -4780,10 +4973,12 @@ void show_state_filter(unsigned long state_filter)
* console might take alot of time:
*/
touch_nmi_watchdog();
- if (p->state & state_filter)
+ if (!state_filter || (p->state & state_filter))
show_task(p);
} while_each_thread(g, p);
+ touch_all_softlockup_watchdogs();
+
read_unlock(&tasklist_lock);
/*
* Only show locks if all tasks are dumped:
@@ -5278,6 +5473,11 @@ int __init migration_init(void)
#endif
#ifdef CONFIG_SMP
+
+/* Number of possible processor ids */
+int nr_cpu_ids __read_mostly = NR_CPUS;
+EXPORT_SYMBOL(nr_cpu_ids);
+
#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
@@ -5333,7 +5533,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
break;
}
- if (!group->cpu_power) {
+ if (!group->__cpu_power) {
printk("\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
@@ -5510,7 +5710,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
continue;
sg->cpumask = CPU_MASK_NONE;
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
for_each_cpu_mask(j, span) {
if (group_fn(j, cpu_map, NULL) != group)
@@ -6199,7 +6399,7 @@ next_sg:
continue;
}
- sg->cpu_power += sd->groups->cpu_power;
+ sg_inc_cpu_power(sg, sd->groups->__cpu_power);
}
sg = sg->next;
if (sg != group_head)
@@ -6274,6 +6474,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
child = sd->child;
+ sd->groups->__cpu_power = 0;
+
/*
* For perf policy, if the groups in child domain share resources
* (for example cores sharing some portions of the cache hierarchy
@@ -6284,18 +6486,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
(child->flags &
(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
return;
}
- sd->groups->cpu_power = 0;
-
/*
* add cpu_power of each child group to this groups cpu_power
*/
group = child->groups;
do {
- sd->groups->cpu_power += group->cpu_power;
+ sg_inc_cpu_power(sd->groups, group->__cpu_power);
group = group->next;
} while (group != child->groups);
}
@@ -6455,7 +6655,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = nodemask;
sg->next = sg;
cpus_or(covered, covered, nodemask);
@@ -6483,7 +6683,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
"Can not alloc domain group for node %d\n", j);
goto error;
}
- sg->cpu_power = 0;
+ sg->__cpu_power = 0;
sg->cpumask = tmp;
sg->next = prev->next;
cpus_or(covered, covered, tmp);
@@ -6760,6 +6960,7 @@ int in_sched_functions(unsigned long addr)
void __init sched_init(void)
{
int i, j, k;
+ int highest_cpu = 0;
for_each_possible_cpu(i) {
struct prio_array *array;
@@ -6794,11 +6995,13 @@ void __init sched_init(void)
// delimiter for bitsearch
__set_bit(MAX_PRIO, array->bitmap);
}
+ highest_cpu = i;
}
set_load_weight(&init_task);
#ifdef CONFIG_SMP
+ nr_cpu_ids = highest_cpu + 1;
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 3670225ecbc..1368e67c848 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -12,7 +12,6 @@
#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -2636,9 +2635,5 @@ __attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
void __init signals_init(void)
{
- sigqueue_cachep =
- kmem_cache_create("sigqueue",
- sizeof(struct sigqueue),
- __alignof__(struct sigqueue),
- SLAB_PANIC, NULL, NULL);
+ sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 50afeb81330..8fa7040247a 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -34,12 +34,32 @@ static struct notifier_block panic_block = {
.notifier_call = softlock_panic,
};
+/*
+ * Returns seconds, approximately. We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+ return sched_clock() >> 30; /* 2^30 ~= 10^9 */
+}
+
void touch_softlockup_watchdog(void)
{
- __raw_get_cpu_var(touch_timestamp) = jiffies;
+ __raw_get_cpu_var(touch_timestamp) = get_timestamp();
}
EXPORT_SYMBOL(touch_softlockup_watchdog);
+void touch_all_softlockup_watchdogs(void)
+{
+ int cpu;
+
+ /* Cause each CPU to re-update its timestamp rather than complain */
+ for_each_online_cpu(cpu)
+ per_cpu(touch_timestamp, cpu) = 0;
+}
+EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
@@ -48,9 +68,18 @@ void softlockup_tick(void)
{
int this_cpu = smp_processor_id();
unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu);
+ unsigned long print_timestamp;
+ unsigned long now;
+
+ if (touch_timestamp == 0) {
+ touch_softlockup_watchdog();
+ return;
+ }
+
+ print_timestamp = per_cpu(print_timestamp, this_cpu);
- /* prevent double reports: */
- if (per_cpu(print_timestamp, this_cpu) == touch_timestamp ||
+ /* report at most once a second */
+ if (print_timestamp < (touch_timestamp + 1) ||
did_panic ||
!per_cpu(watchdog_task, this_cpu))
return;
@@ -61,12 +90,14 @@ void softlockup_tick(void)
return;
}
+ now = get_timestamp();
+
/* Wake up the high-prio watchdog task every second: */
- if (time_after(jiffies, touch_timestamp + HZ))
+ if (now > (touch_timestamp + 1))
wake_up_process(per_cpu(watchdog_task, this_cpu));
/* Warn about unreasonable 10+ seconds delays: */
- if (time_after(jiffies, touch_timestamp + 10*HZ)) {
+ if (now > (touch_timestamp + 10)) {
per_cpu(print_timestamp, this_cpu) = touch_timestamp;
spin_lock(&print_lock);
@@ -82,11 +113,14 @@ void softlockup_tick(void)
*/
static int watchdog(void * __bind_cpu)
{
- struct sched_param param = { .sched_priority = 99 };
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
sched_setscheduler(current, SCHED_FIFO, &param);
current->flags |= PF_NOFREEZE;
+ /* initialize timestamp */
+ touch_softlockup_watchdog();
+
/*
* Run briefly once per second to reset the softlockup timestamp.
* If this gets delayed for more than 10 seconds then the
@@ -118,7 +152,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
printk("watchdog for %i failed\n", hotcpu);
return NOTIFY_BAD;
}
- per_cpu(touch_timestamp, hotcpu) = jiffies;
+ per_cpu(touch_timestamp, hotcpu) = 0;
per_cpu(watchdog_task, hotcpu) = p;
kthread_bind(p, hotcpu);
break;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 12458040e66..daabb74ee0b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,11 +1,12 @@
/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
* GPL v2 and any later version.
*/
-#include <linux/stop_machine.h>
-#include <linux/kthread.h>
-#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/err.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/stop_machine.h>
#include <linux/syscalls.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
@@ -208,3 +209,4 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
return ret;
}
+EXPORT_SYMBOL_GPL(stop_machine_run);
diff --git a/kernel/sys.c b/kernel/sys.c
index 123b165080e..926bf9d7ac4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -881,7 +881,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
#ifdef CONFIG_SOFTWARE_SUSPEND
case LINUX_REBOOT_CMD_SW_SUSPEND:
{
- int ret = software_suspend();
+ int ret = pm_suspend(PM_SUSPEND_DISK);
unlock_kernel();
return ret;
}
@@ -1923,6 +1923,16 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
if (retval)
return retval;
+ if (resource == RLIMIT_CPU && new_rlim.rlim_cur == 0) {
+ /*
+ * The caller is asking for an immediate RLIMIT_CPU
+ * expiry. But we use the zero value to mean "it was
+ * never set". So let's cheat and make it one second
+ * instead
+ */
+ new_rlim.rlim_cur = 1;
+ }
+
task_lock(current->group_leader);
*old_rlim = new_rlim;
task_unlock(current->group_leader);
@@ -1944,15 +1954,6 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
unsigned long rlim_cur = new_rlim.rlim_cur;
cputime_t cputime;
- if (rlim_cur == 0) {
- /*
- * The caller is asking for an immediate RLIMIT_CPU
- * expiry. But we use the zero value to mean "it was
- * never set". So let's cheat and make it one second
- * instead
- */
- rlim_cur = 1;
- }
cputime = secs_to_cputime(rlim_cur);
read_lock(&tasklist_lock);
spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1b255df4fcd..f0664bd5011 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -76,6 +76,7 @@ extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
+extern int maps_protect;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
@@ -603,6 +604,16 @@ static ctl_table kern_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+#ifdef CONFIG_PROC_FS
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "maps_protect",
+ .data = &maps_protect,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{ .ctl_name = 0 }
};
@@ -1676,7 +1687,7 @@ static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp,
{
int op;
- if (!capable(CAP_SYS_ADMIN))
+ if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
op = OP_OR;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4c3476fa058..906cae77158 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -102,7 +102,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
*/
static int send_reply(struct sk_buff *skb, pid_t pid)
{
- struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
void *reply = genlmsg_data(genlhdr);
int rc;
@@ -121,7 +121,7 @@ static int send_reply(struct sk_buff *skb, pid_t pid)
static void send_cpu_listeners(struct sk_buff *skb,
struct listener_list *listeners)
{
- struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+ struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb));
struct listener *s, *tmp;
struct sk_buff *skb_next, *skb_cur = skb;
void *reply = genlmsg_data(genlhdr);
@@ -524,9 +524,7 @@ void __init taskstats_init_early(void)
{
unsigned int i;
- taskstats_cache = kmem_cache_create("taskstats_cache",
- sizeof(struct taskstats),
- 0, SLAB_PANIC, NULL, NULL);
+ taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC);
for_each_possible_cpu(i) {
INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
init_rwsem(&(per_cpu(listener_array, i).sem));
diff --git a/kernel/time.c b/kernel/time.c
index c6c80ea5d0e..f04791f6940 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -31,7 +31,6 @@
#include <linux/timex.h>
#include <linux/capability.h>
#include <linux/errno.h>
-#include <linux/smp_lock.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/fs.h>
@@ -247,6 +246,36 @@ struct timespec current_fs_time(struct super_block *sb)
}
EXPORT_SYMBOL(current_fs_time);
+/*
+ * Convert jiffies to milliseconds and back.
+ *
+ * Avoid unnecessary multiplications/divisions in the
+ * two most common HZ cases:
+ */
+unsigned int inline jiffies_to_msecs(const unsigned long j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+ return (MSEC_PER_SEC / HZ) * j;
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
+#else
+ return (j * MSEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_msecs);
+
+unsigned int inline jiffies_to_usecs(const unsigned long j)
+{
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+ return (USEC_PER_SEC / HZ) * j;
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+ return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
+#else
+ return (j * USEC_PER_SEC) / HZ;
+#endif
+}
+EXPORT_SYMBOL(jiffies_to_usecs);
+
/**
* timespec_trunc - Truncate timespec to a granularity
* @t: Timespec
@@ -452,6 +481,7 @@ struct timespec ns_to_timespec(const s64 nsec)
return ts;
}
+EXPORT_SYMBOL(ns_to_timespec);
/**
* ns_to_timeval - Convert nanoseconds to timeval
@@ -469,36 +499,7 @@ struct timeval ns_to_timeval(const s64 nsec)
return tv;
}
-
-/*
- * Convert jiffies to milliseconds and back.
- *
- * Avoid unnecessary multiplications/divisions in the
- * two most common HZ cases:
- */
-unsigned int jiffies_to_msecs(const unsigned long j)
-{
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
- return (MSEC_PER_SEC / HZ) * j;
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
- return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC);
-#else
- return (j * MSEC_PER_SEC) / HZ;
-#endif
-}
-EXPORT_SYMBOL(jiffies_to_msecs);
-
-unsigned int jiffies_to_usecs(const unsigned long j)
-{
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
- return (USEC_PER_SEC / HZ) * j;
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
- return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
-#else
- return (j * USEC_PER_SEC) / HZ;
-#endif
-}
-EXPORT_SYMBOL(jiffies_to_usecs);
+EXPORT_SYMBOL(ns_to_timeval);
/*
* When we convert to jiffies then we interpret incoming values
@@ -635,6 +636,7 @@ timeval_to_jiffies(const struct timeval *value)
(((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
(USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
}
+EXPORT_SYMBOL(timeval_to_jiffies);
void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
{
@@ -649,6 +651,7 @@ void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value)
tv_usec /= NSEC_PER_USEC;
value->tv_usec = tv_usec;
}
+EXPORT_SYMBOL(jiffies_to_timeval);
/*
* Convert jiffies/jiffies_64 to clock_t and back.
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 93bccba1f26..99b6034fc86 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += ntp.o clocksource.o jiffies.o timer_list.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 67932ea78c1..76212b2a99d 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -274,72 +274,3 @@ void clockevents_notify(unsigned long reason, void *arg)
}
EXPORT_SYMBOL_GPL(clockevents_notify);
-#ifdef CONFIG_SYSFS
-
-/**
- * clockevents_show_registered - sysfs interface for listing clockevents
- * @dev: unused
- * @buf: char buffer to be filled with clock events list
- *
- * Provides sysfs interface for listing registered clock event devices
- */
-static ssize_t clockevents_show_registered(struct sys_device *dev, char *buf)
-{
- struct list_head *tmp;
- char *p = buf;
- int cpu;
-
- spin_lock(&clockevents_lock);
-
- list_for_each(tmp, &clockevent_devices) {
- struct clock_event_device *ce;
-
- ce = list_entry(tmp, struct clock_event_device, list);
- p += sprintf(p, "%-20s F:%04x M:%d", ce->name,
- ce->features, ce->mode);
- p += sprintf(p, " C:");
- if (!cpus_equal(ce->cpumask, cpu_possible_map)) {
- for_each_cpu_mask(cpu, ce->cpumask)
- p += sprintf(p, " %d", cpu);
- } else {
- /*
- * FIXME: Add the cpu which is handling this sucker
- */
- }
- p += sprintf(p, "\n");
- }
-
- spin_unlock(&clockevents_lock);
-
- return p - buf;
-}
-
-/*
- * Sysfs setup bits:
- */
-static SYSDEV_ATTR(registered, 0600,
- clockevents_show_registered, NULL);
-
-static struct sysdev_class clockevents_sysclass = {
- set_kset_name("clockevents"),
-};
-
-static struct sys_device clockevents_sys_device = {
- .id = 0,
- .cls = &clockevents_sysclass,
-};
-
-static int __init clockevents_sysfs_init(void)
-{
- int error = sysdev_class_register(&clockevents_sysclass);
-
- if (!error)
- error = sysdev_register(&clockevents_sys_device);
- if (!error)
- error = sysdev_create_file(
- &clockevents_sys_device,
- &attr_registered);
- return error;
-}
-device_initcall(clockevents_sysfs_init);
-#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 5b0e46b56fd..fe5c7db2424 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -151,7 +151,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer(&watchdog_timer);
}
- } else if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) {
+ } else {
+ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
if (!watchdog || cs->rating > watchdog->rating) {
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 3be8da8fed7..4c256fdb887 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -69,4 +69,4 @@ static int __init init_jiffies_clocksource(void)
return clocksource_register(&clocksource_jiffies);
}
-module_init(init_jiffies_clocksource);
+core_initcall(init_jiffies_clocksource);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index eb12509e00b..cb25649c6f5 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -32,7 +32,7 @@ static u64 tick_length, tick_length_base;
/* TIME_ERROR prevents overwriting the CMOS clock */
static int time_state = TIME_OK; /* clock synchronization status */
int time_status = STA_UNSYNC; /* clock status bits */
-static long time_offset; /* time adjustment (ns) */
+static s64 time_offset; /* time adjustment (ns) */
static long time_constant = 2; /* pll time constant */
long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
@@ -196,7 +196,7 @@ void __attribute__ ((weak)) notify_arch_cmos_timer(void)
*/
int do_adjtimex(struct timex *txc)
{
- long ltemp, mtemp, save_adjust;
+ long mtemp, save_adjust, rem;
s64 freq_adj, temp64;
int result;
@@ -277,14 +277,14 @@ int do_adjtimex(struct timex *txc)
time_adjust = txc->offset;
}
else if (time_status & STA_PLL) {
- ltemp = txc->offset * NSEC_PER_USEC;
+ time_offset = txc->offset * NSEC_PER_USEC;
/*
* Scale the phase adjustment and
* clamp to the operating range.
*/
- time_offset = min(ltemp, MAXPHASE * NSEC_PER_USEC);
- time_offset = max(time_offset, -MAXPHASE * NSEC_PER_USEC);
+ time_offset = min(time_offset, (s64)MAXPHASE * NSEC_PER_USEC);
+ time_offset = max(time_offset, (s64)-MAXPHASE * NSEC_PER_USEC);
/*
* Select whether the frequency is to be controlled
@@ -297,11 +297,11 @@ int do_adjtimex(struct timex *txc)
mtemp = xtime.tv_sec - time_reftime;
time_reftime = xtime.tv_sec;
- freq_adj = (s64)time_offset * mtemp;
+ freq_adj = time_offset * mtemp;
freq_adj = shift_right(freq_adj, time_constant * 2 +
(SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
- temp64 = (s64)time_offset << (SHIFT_NSEC - SHIFT_FLL);
+ temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
if (time_offset < 0) {
temp64 = -temp64;
do_div(temp64, mtemp);
@@ -314,8 +314,10 @@ int do_adjtimex(struct timex *txc)
freq_adj += time_freq;
freq_adj = min(freq_adj, (s64)MAXFREQ_NSEC);
time_freq = max(freq_adj, (s64)-MAXFREQ_NSEC);
- time_offset = (time_offset / NTP_INTERVAL_FREQ)
- << SHIFT_UPDATE;
+ time_offset = div_long_long_rem_signed(time_offset,
+ NTP_INTERVAL_FREQ,
+ &rem);
+ time_offset <<= SHIFT_UPDATE;
} /* STA_PLL */
} /* txc->modes & ADJ_OFFSET */
if (txc->modes & ADJ_TICK)
@@ -328,12 +330,12 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
result = TIME_ERROR;
if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
- txc->offset = save_adjust;
+ txc->offset = save_adjust;
else
- txc->offset = shift_right(time_offset, SHIFT_UPDATE)
- * NTP_INTERVAL_FREQ / 1000;
- txc->freq = (time_freq / NSEC_PER_USEC)
- << (SHIFT_USEC - SHIFT_NSEC);
+ txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) *
+ NTP_INTERVAL_FREQ / 1000;
+ txc->freq = (time_freq / NSEC_PER_USEC) <<
+ (SHIFT_USEC - SHIFT_NSEC);
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5567745470f..eadfce2fff7 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -307,12 +307,19 @@ int tick_resume_broadcast(void)
spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
- if (bc) {
- if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC &&
- !cpus_empty(tick_broadcast_mask))
- tick_broadcast_start_periodic(bc);
- broadcast = cpu_isset(smp_processor_id(), tick_broadcast_mask);
+ if (bc) {
+ switch (tick_broadcast_device.mode) {
+ case TICKDEV_MODE_PERIODIC:
+ if(!cpus_empty(tick_broadcast_mask))
+ tick_broadcast_start_periodic(bc);
+ broadcast = cpu_isset(smp_processor_id(),
+ tick_broadcast_mask);
+ break;
+ case TICKDEV_MODE_ONESHOT:
+ broadcast = tick_resume_broadcast_oneshot(bc);
+ break;
+ }
}
spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -347,6 +354,16 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
}
}
+int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+ clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+
+ if(!cpus_empty(tick_broadcast_oneshot_mask))
+ tick_broadcast_set_event(ktime_get(), 1);
+
+ return cpu_isset(smp_processor_id(), tick_broadcast_oneshot_mask);
+}
+
/*
* Reprogram the broadcast device:
*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 43ba1bdec14..a96ec9ab345 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -31,7 +31,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
*/
ktime_t tick_next_period;
ktime_t tick_period;
-static int tick_do_timer_cpu = -1;
+int tick_do_timer_cpu __read_mostly = -1;
DEFINE_SPINLOCK(tick_device_lock);
/*
@@ -295,21 +295,26 @@ static void tick_shutdown(unsigned int *cpup)
clockevents_exchange_device(dev, NULL);
td->evtdev = NULL;
}
+ /* Transfer the do_timer job away from this cpu */
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = first_cpu(cpu_online_map);
+
+ tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+ }
spin_unlock_irqrestore(&tick_device_lock, flags);
}
-static void tick_suspend_periodic(void)
+static void tick_suspend(void)
{
struct tick_device *td = &__get_cpu_var(tick_cpu_device);
unsigned long flags;
spin_lock_irqsave(&tick_device_lock, flags);
- if (td->mode == TICKDEV_MODE_PERIODIC)
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+ clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
spin_unlock_irqrestore(&tick_device_lock, flags);
}
-static void tick_resume_periodic(void)
+static void tick_resume(void)
{
struct tick_device *td = &__get_cpu_var(tick_cpu_device);
unsigned long flags;
@@ -317,6 +322,8 @@ static void tick_resume_periodic(void)
spin_lock_irqsave(&tick_device_lock, flags);
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(td->evtdev, 0);
+ else
+ tick_resume_oneshot();
spin_unlock_irqrestore(&tick_device_lock, flags);
}
@@ -348,13 +355,13 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
break;
case CLOCK_EVT_NOTIFY_SUSPEND:
- tick_suspend_periodic();
+ tick_suspend();
tick_suspend_broadcast();
break;
case CLOCK_EVT_NOTIFY_RESUME:
if (!tick_resume_broadcast())
- tick_resume_periodic();
+ tick_resume();
break;
default:
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 75890efd24f..bb13f272490 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -5,6 +5,7 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
extern spinlock_t tick_device_lock;
extern ktime_t tick_next_period;
extern ktime_t tick_period;
+extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
@@ -19,12 +20,13 @@ extern void tick_setup_oneshot(struct clock_event_device *newdev,
extern int tick_program_event(ktime_t expires, int force);
extern void tick_oneshot_notify(void);
extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
-
+extern void tick_resume_oneshot(void);
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc);
extern void tick_broadcast_oneshot_control(unsigned long reason);
extern void tick_broadcast_switch_to_oneshot(void);
extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
+extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -43,6 +45,10 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
{
BUG();
}
+static inline void tick_resume_oneshot(void)
+{
+ BUG();
+}
static inline int tick_program_event(ktime_t expires, int force)
{
return 0;
@@ -54,6 +60,10 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
}
static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+ return 0;
+}
#endif /* !TICK_ONESHOT */
/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e8b7ff863c..f6997ab0c3c 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -41,6 +41,18 @@ int tick_program_event(ktime_t expires, int force)
}
/**
+ * tick_resume_onshot - resume oneshot mode
+ */
+void tick_resume_oneshot(void)
+{
+ struct tick_device *td = &__get_cpu_var(tick_cpu_device);
+ struct clock_event_device *dev = td->evtdev;
+
+ clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
+ tick_program_event(ktime_get(), 1);
+}
+
+/**
* tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz)
*/
void tick_setup_oneshot(struct clock_event_device *newdev,
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 51556b95f60..3483e6cb954 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -217,10 +217,30 @@ void tick_nohz_stop_sched_tick(void)
* the scheduler tick in nohz_restart_sched_tick.
*/
if (!ts->tick_stopped) {
+ if (select_nohz_load_balancer(1)) {
+ /*
+ * sched tick not stopped!
+ */
+ cpu_clear(cpu, nohz_cpu_mask);
+ goto out;
+ }
+
ts->idle_tick = ts->sched_timer.expires;
ts->tick_stopped = 1;
ts->idle_jiffies = last_jiffies;
}
+
+ /*
+ * If this cpu is the one which updates jiffies, then
+ * give up the assignment and let it be taken by the
+ * cpu which runs the tick timer next, which might be
+ * this cpu as well. If we don't drop this here the
+ * jiffies might be stale and do_timer() never
+ * invoked.
+ */
+ if (cpu == tick_do_timer_cpu)
+ tick_do_timer_cpu = -1;
+
/*
* calculate the expiry time for the next timer wheel
* timer
@@ -273,6 +293,7 @@ void tick_nohz_restart_sched_tick(void)
now = ktime_get();
local_irq_disable();
+ select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
cpu_clear(cpu, nohz_cpu_mask);
@@ -338,12 +359,24 @@ static void tick_nohz_handler(struct clock_event_device *dev)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
struct pt_regs *regs = get_irq_regs();
+ int cpu = smp_processor_id();
ktime_t now = ktime_get();
dev->next_event.tv64 = KTIME_MAX;
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == -1))
+ tick_do_timer_cpu = cpu;
+
/* Check, if the jiffies need an update */
- tick_do_update_jiffies64(now);
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
/*
* When we are idle and the tick is stopped, we have to touch
@@ -431,9 +464,23 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
+ int cpu = smp_processor_id();
+
+#ifdef CONFIG_NO_HZ
+ /*
+ * Check if the do_timer duty was dropped. We don't care about
+ * concurrency: This happens only when the cpu in charge went
+ * into a long sleep. If two cpus happen to assign themself to
+ * this duty, then the jiffies update is still serialized by
+ * xtime_lock.
+ */
+ if (unlikely(tick_do_timer_cpu == -1))
+ tick_do_timer_cpu = cpu;
+#endif
/* Check, if the jiffies need an update */
- tick_do_update_jiffies64(now);
+ if (tick_do_timer_cpu == cpu)
+ tick_do_update_jiffies64(now);
/*
* Do not call, when we are not in irq context and have
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
new file mode 100644
index 00000000000..f9217bf644f
--- /dev/null
+++ b/kernel/time/timekeeping.c
@@ -0,0 +1,476 @@
+/*
+ * linux/kernel/time/timekeeping.c
+ *
+ * Kernel timekeeping code and accessor functions
+ *
+ * This code was moved from linux/kernel/timer.c.
+ * Please see that file for copyright and history logs.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/sysdev.h>
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/time.h>
+#include <linux/tick.h>
+
+
+/*
+ * This read-write spinlock protects us from races in SMP while
+ * playing with xtime and avenrun.
+ */
+__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
+
+EXPORT_SYMBOL(xtime_lock);
+
+
+/*
+ * The current time
+ * wall_to_monotonic is what we need to add to xtime (or xtime corrected
+ * for sub jiffie times) to get to monotonic time. Monotonic is pegged
+ * at zero at system boot time, so wall_to_monotonic will be negative,
+ * however, we will ALWAYS keep the tv_nsec part positive so we can use
+ * the usual normalization.
+ */
+struct timespec xtime __attribute__ ((aligned (16)));
+struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
+
+EXPORT_SYMBOL(xtime);
+
+
+static struct clocksource *clock; /* pointer to current clocksource */
+
+
+#ifdef CONFIG_GENERIC_TIME
+/**
+ * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
+ *
+ * private function, must hold xtime_lock lock when being
+ * called. Returns the number of nanoseconds since the
+ * last call to update_wall_time() (adjusted by NTP scaling)
+ */
+static inline s64 __get_nsec_offset(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 ns_offset;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyc2ns(clock, cycle_delta);
+
+ return ns_offset;
+}
+
+/**
+ * __get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec. Used by
+ * do_gettimeofday() and get_realtime_clock_ts().
+ */
+static inline void __get_realtime_clock_ts(struct timespec *ts)
+{
+ unsigned long seq;
+ s64 nsecs;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ *ts = xtime;
+ nsecs = __get_nsec_offset();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ timespec_add_ns(ts, nsecs);
+}
+
+/**
+ * getnstimeofday - Returns the time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+ __get_realtime_clock_ts(ts);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv: pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using get_realtime_clock_ts()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+ struct timespec now;
+
+ __get_realtime_clock_ts(&now);
+ tv->tv_sec = now.tv_sec;
+ tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv: pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ */
+int do_settimeofday(struct timespec *tv)
+{
+ unsigned long flags;
+ time_t wtm_sec, sec = tv->tv_sec;
+ long wtm_nsec, nsec = tv->tv_nsec;
+
+ if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ nsec -= __get_nsec_offset();
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+ set_normalized_timespec(&xtime, sec, nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+ clock->error = 0;
+ ntp_clear();
+
+ update_vsyscall(&xtime, clock);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ */
+static void change_clocksource(void)
+{
+ struct clocksource *new;
+ cycle_t now;
+ u64 nsec;
+
+ new = clocksource_get_next();
+
+ if (clock == new)
+ return;
+
+ now = clocksource_read(new);
+ nsec = __get_nsec_offset();
+ timespec_add_ns(&xtime, nsec);
+
+ clock = new;
+ clock->cycle_last = now;
+
+ clock->error = 0;
+ clock->xtime_nsec = 0;
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+
+ tick_clock_notify();
+
+ printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+ clock->name);
+}
+#else
+static inline void change_clocksource(void) { }
+#endif
+
+/**
+ * timekeeping_is_continuous - check to see if timekeeping is free running
+ */
+int timekeeping_is_continuous(void)
+{
+ unsigned long seq;
+ int ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+
+ ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return ret;
+}
+
+/**
+ * read_persistent_clock - Return time in seconds from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Returns seconds from epoch using the battery backed persistent clock.
+ * Returns zero if unsupported.
+ *
+ * XXX - Do be sure to remove it once all arches implement it.
+ */
+unsigned long __attribute__((weak)) read_persistent_clock(void)
+{
+ return 0;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ */
+void __init timekeeping_init(void)
+{
+ unsigned long flags;
+ unsigned long sec = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ ntp_clear();
+
+ clock = clocksource_get_next();
+ clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
+ clock->cycle_last = clocksource_read(clock);
+
+ xtime.tv_sec = sec;
+ xtime.tv_nsec = 0;
+ set_normalized_timespec(&wall_to_monotonic,
+ -xtime.tv_sec, -xtime.tv_nsec);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+/* flag for if timekeeping is suspended */
+static int timekeeping_suspended;
+/* time in seconds when suspend began */
+static unsigned long timekeeping_suspend_time;
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ * @dev: unused
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static int timekeeping_resume(struct sys_device *dev)
+{
+ unsigned long flags;
+ unsigned long now = read_persistent_clock();
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ if (now && (now > timekeeping_suspend_time)) {
+ unsigned long sleep_length = now - timekeeping_suspend_time;
+
+ xtime.tv_sec += sleep_length;
+ wall_to_monotonic.tv_sec -= sleep_length;
+ }
+ /* re-base the last cycle value */
+ clock->cycle_last = clocksource_read(clock);
+ clock->error = 0;
+ timekeeping_suspended = 0;
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ touch_softlockup_watchdog();
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+
+ /* Resume hrtimers */
+ hres_timers_resume();
+
+ return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ timekeeping_suspended = 1;
+ timekeeping_suspend_time = read_persistent_clock();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+
+ return 0;
+}
+
+/* sysfs resume/suspend bits for timekeeping */
+static struct sysdev_class timekeeping_sysclass = {
+ .resume = timekeeping_resume,
+ .suspend = timekeeping_suspend,
+ set_kset_name("timekeeping"),
+};
+
+static struct sys_device device_timer = {
+ .id = 0,
+ .cls = &timekeeping_sysclass,
+};
+
+static int __init timekeeping_init_device(void)
+{
+ int error = sysdev_class_register(&timekeeping_sysclass);
+ if (!error)
+ error = sysdev_register(&device_timer);
+ return error;
+}
+
+device_initcall(timekeeping_init_device);
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ */
+static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
+ s64 *offset)
+{
+ s64 tick_error, i;
+ u32 look_ahead, adj;
+ s32 error2, mult;
+
+ /*
+ * Use the current error value to determine how much to look ahead.
+ * The larger the error the slower we adjust for it to avoid problems
+ * with losing too many ticks, otherwise we would overadjust and
+ * produce an even larger error. The smaller the adjustment the
+ * faster we try to adjust for it, as lost ticks can do less harm
+ * here. This is tuned so that an error of about 1 msec is adusted
+ * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+ */
+ error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
+ error2 = abs(error2);
+ for (look_ahead = 0; error2 > 0; look_ahead++)
+ error2 >>= 2;
+
+ /*
+ * Now calculate the error in (1 << look_ahead) ticks, but first
+ * remove the single look ahead already included in the error.
+ */
+ tick_error = current_tick_length() >>
+ (TICK_LENGTH_SHIFT - clock->shift + 1);
+ tick_error -= clock->xtime_interval >> 1;
+ error = ((error - tick_error) >> look_ahead) + tick_error;
+
+ /* Finally calculate the adjustment shift value. */
+ i = *interval;
+ mult = 1;
+ if (error < 0) {
+ error = -error;
+ *interval = -*interval;
+ *offset = -*offset;
+ mult = -1;
+ }
+ for (adj = 0; error > i; adj++)
+ error >>= 1;
+
+ *interval <<= adj;
+ *offset <<= adj;
+ return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ */
+static void clocksource_adjust(struct clocksource *clock, s64 offset)
+{
+ s64 error, interval = clock->cycle_interval;
+ int adj;
+
+ error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
+ if (error > interval) {
+ error >>= 2;
+ if (likely(error <= interval))
+ adj = 1;
+ else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else if (error < -interval) {
+ error >>= 2;
+ if (likely(error >= -interval)) {
+ adj = -1;
+ interval = -interval;
+ offset = -offset;
+ } else
+ adj = clocksource_bigadjust(error, &interval, &offset);
+ } else
+ return;
+
+ clock->mult += adj;
+ clock->xtime_interval += interval;
+ clock->xtime_nsec -= offset;
+ clock->error -= (interval - offset) <<
+ (TICK_LENGTH_SHIFT - clock->shift);
+}
+
+/**
+ * update_wall_time - Uses the current clocksource to increment the wall time
+ *
+ * Called from the timer interrupt, must hold a write on xtime_lock.
+ */
+void update_wall_time(void)
+{
+ cycle_t offset;
+
+ /* Make sure we're fully resumed: */
+ if (unlikely(timekeeping_suspended))
+ return;
+
+#ifdef CONFIG_GENERIC_TIME
+ offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
+#else
+ offset = clock->cycle_interval;
+#endif
+ clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+
+ /* normally this loop will run just once, however in the
+ * case of lost or late ticks, it will accumulate correctly.
+ */
+ while (offset >= clock->cycle_interval) {
+ /* accumulate one interval */
+ clock->xtime_nsec += clock->xtime_interval;
+ clock->cycle_last += clock->cycle_interval;
+ offset -= clock->cycle_interval;
+
+ if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
+ clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
+ xtime.tv_sec++;
+ second_overflow();
+ }
+
+ /* interpolator bits */
+ time_interpolator_update(clock->xtime_interval
+ >> clock->shift);
+
+ /* accumulate error between NTP and clock interval */
+ clock->error += current_tick_length();
+ clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
+ }
+
+ /* correct the clock when NTP error is too big */
+ clocksource_adjust(clock, offset);
+
+ /* store full nanoseconds into xtime */
+ xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
+ clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
+
+ /* check to see if there is a new clocksource to use */
+ change_clocksource();
+ update_vsyscall(&xtime, clock);
+}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f82c635c3d5..b734ca4bc75 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -38,17 +38,12 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
static void print_name_offset(struct seq_file *m, void *sym)
{
- unsigned long addr = (unsigned long)sym;
- char namebuf[KSYM_NAME_LEN+1];
- unsigned long size, offset;
- const char *sym_name;
- char *modname;
-
- sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
- if (sym_name)
- SEQ_printf(m, "%s", sym_name);
- else
+ char symname[KSYM_NAME_LEN+1];
+
+ if (lookup_symbol_name((unsigned long)sym, symname) < 0)
SEQ_printf(m, "<%p>", sym);
+ else
+ SEQ_printf(m, "%s", symname);
}
static void
@@ -194,9 +189,9 @@ print_tickdevice(struct seq_file *m, struct tick_device *td)
return;
}
SEQ_printf(m, "%s\n", dev->name);
- SEQ_printf(m, " max_delta_ns: %ld\n", dev->max_delta_ns);
- SEQ_printf(m, " min_delta_ns: %ld\n", dev->min_delta_ns);
- SEQ_printf(m, " mult: %ld\n", dev->mult);
+ SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns);
+ SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns);
+ SEQ_printf(m, " mult: %lu\n", dev->mult);
SEQ_printf(m, " shift: %d\n", dev->shift);
SEQ_printf(m, " mode: %d\n", dev->mode);
SEQ_printf(m, " next_event: %Ld nsecs\n",
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1bc4882e28e..868f1bceb07 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -257,16 +257,12 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
static void print_name_offset(struct seq_file *m, unsigned long addr)
{
- char namebuf[KSYM_NAME_LEN+1];
- unsigned long size, offset;
- const char *sym_name;
- char *modname;
-
- sym_name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
- if (sym_name)
- seq_printf(m, "%s", sym_name);
- else
+ char symname[KSYM_NAME_LEN+1];
+
+ if (lookup_symbol_name(addr, symname) < 0)
seq_printf(m, "<%p>", (void *)addr);
+ else
+ seq_printf(m, "%s", symname);
}
static int tstats_show(struct seq_file *m, void *v)
diff --git a/kernel/timer.c b/kernel/timer.c
index 797cccb8643..7a6448340f9 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
/*
* linux/kernel/timer.c
*
- * Kernel internal timers, kernel timekeeping, basic process system calls
+ * Kernel internal timers, basic process system calls
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
@@ -74,7 +74,7 @@ struct tvec_t_base_s {
tvec_t tv3;
tvec_t tv4;
tvec_t tv5;
-} ____cacheline_aligned_in_smp;
+} ____cacheline_aligned;
typedef struct tvec_t_base_s tvec_base_t;
@@ -82,6 +82,37 @@ tvec_base_t boot_tvec_bases;
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
+/*
+ * Note that all tvec_bases is 2 byte aligned and lower bit of
+ * base in timer_list is guaranteed to be zero. Use the LSB for
+ * the new flag to indicate whether the timer is deferrable
+ */
+#define TBASE_DEFERRABLE_FLAG (0x1)
+
+/* Functions below help us manage 'deferrable' flag */
+static inline unsigned int tbase_get_deferrable(tvec_base_t *base)
+{
+ return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
+}
+
+static inline tvec_base_t *tbase_get_base(tvec_base_t *base)
+{
+ return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
+}
+
+static inline void timer_set_deferrable(struct timer_list *timer)
+{
+ timer->base = ((tvec_base_t *)((unsigned long)(timer->base) |
+ TBASE_DEFERRABLE_FLAG));
+}
+
+static inline void
+timer_set_base(struct timer_list *timer, tvec_base_t *new_base)
+{
+ timer->base = (tvec_base_t *)((unsigned long)(new_base) |
+ tbase_get_deferrable(timer->base));
+}
+
/**
* __round_jiffies - function to round jiffies to a full second
* @j: the time in (absolute) jiffies that should be rounded
@@ -295,6 +326,13 @@ void fastcall init_timer(struct timer_list *timer)
}
EXPORT_SYMBOL(init_timer);
+void fastcall init_timer_deferrable(struct timer_list *timer)
+{
+ init_timer(timer);
+ timer_set_deferrable(timer);
+}
+EXPORT_SYMBOL(init_timer_deferrable);
+
static inline void detach_timer(struct timer_list *timer,
int clear_pending)
{
@@ -325,10 +363,11 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
tvec_base_t *base;
for (;;) {
- base = timer->base;
+ tvec_base_t *prelock_base = timer->base;
+ base = tbase_get_base(prelock_base);
if (likely(base != NULL)) {
spin_lock_irqsave(&base->lock, *flags);
- if (likely(base == timer->base))
+ if (likely(prelock_base == timer->base))
return base;
/* The timer has migrated to another CPU */
spin_unlock_irqrestore(&base->lock, *flags);
@@ -365,11 +404,11 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
*/
if (likely(base->running_timer != timer)) {
/* See the comment in lock_timer_base() */
- timer->base = NULL;
+ timer_set_base(timer, NULL);
spin_unlock(&base->lock);
base = new_base;
spin_lock(&base->lock);
- timer->base = base;
+ timer_set_base(timer, base);
}
}
@@ -397,7 +436,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
spin_lock_irqsave(&base->lock, flags);
- timer->base = base;
+ timer_set_base(timer, base);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
}
@@ -505,6 +544,8 @@ out:
return ret;
}
+EXPORT_SYMBOL(try_to_del_timer_sync);
+
/**
* del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
@@ -548,7 +589,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
* don't have to detach them individually.
*/
list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
- BUG_ON(timer->base != base);
+ BUG_ON(tbase_get_base(timer->base) != base);
internal_add_timer(base, timer);
}
@@ -588,7 +629,7 @@ static inline void __run_timers(tvec_base_t *base)
void (*fn)(unsigned long);
unsigned long data;
- timer = list_entry(head->next,struct timer_list,entry);
+ timer = list_first_entry(head, struct timer_list,entry);
fn = timer->function;
data = timer->data;
@@ -634,6 +675,9 @@ static unsigned long __next_timer_interrupt(tvec_base_t *base)
index = slot = timer_jiffies & TVR_MASK;
do {
list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+ if (tbase_get_deferrable(nte->base))
+ continue;
+
found = 1;
expires = nte->expires;
/* Look at the cascade bucket(s)? */
@@ -695,15 +739,28 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
{
ktime_t hr_delta = hrtimer_get_next_event();
struct timespec tsdelta;
+ unsigned long delta;
if (hr_delta.tv64 == KTIME_MAX)
return expires;
- if (hr_delta.tv64 <= TICK_NSEC)
- return now;
+ /*
+ * Expired timer available, let it expire in the next tick
+ */
+ if (hr_delta.tv64 <= 0)
+ return now + 1;
tsdelta = ktime_to_timespec(hr_delta);
- now += timespec_to_jiffies(&tsdelta);
+ delta = timespec_to_jiffies(&tsdelta);
+ /*
+ * Take rounding errors in to account and make sure, that it
+ * expires in the next tick. Otherwise we go into an endless
+ * ping pong due to tick_nohz_stop_sched_tick() retriggering
+ * the timer softirq
+ */
+ if (delta < 1)
+ delta = 1;
+ now += delta;
if (time_before(now, expires))
return now;
return expires;
@@ -737,455 +794,6 @@ unsigned long next_timer_interrupt(void)
#endif
-/******************************************************************/
-
-/*
- * The current time
- * wall_to_monotonic is what we need to add to xtime (or xtime corrected
- * for sub jiffie times) to get to monotonic time. Monotonic is pegged
- * at zero at system boot time, so wall_to_monotonic will be negative,
- * however, we will ALWAYS keep the tv_nsec part positive so we can use
- * the usual normalization.
- */
-struct timespec xtime __attribute__ ((aligned (16)));
-struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
-
-EXPORT_SYMBOL(xtime);
-
-
-/* XXX - all of this timekeeping code should be later moved to time.c */
-#include <linux/clocksource.h>
-static struct clocksource *clock; /* pointer to current clocksource */
-
-#ifdef CONFIG_GENERIC_TIME
-/**
- * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
- *
- * private function, must hold xtime_lock lock when being
- * called. Returns the number of nanoseconds since the
- * last call to update_wall_time() (adjusted by NTP scaling)
- */
-static inline s64 __get_nsec_offset(void)
-{
- cycle_t cycle_now, cycle_delta;
- s64 ns_offset;
-
- /* read clocksource: */
- cycle_now = clocksource_read(clock);
-
- /* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
-
- /* convert to nanoseconds: */
- ns_offset = cyc2ns(clock, cycle_delta);
-
- return ns_offset;
-}
-
-/**
- * __get_realtime_clock_ts - Returns the time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec. Used by
- * do_gettimeofday() and get_realtime_clock_ts().
- */
-static inline void __get_realtime_clock_ts(struct timespec *ts)
-{
- unsigned long seq;
- s64 nsecs;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- *ts = xtime;
- nsecs = __get_nsec_offset();
-
- } while (read_seqretry(&xtime_lock, seq));
-
- timespec_add_ns(ts, nsecs);
-}
-
-/**
- * getnstimeofday - Returns the time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- */
-void getnstimeofday(struct timespec *ts)
-{
- __get_realtime_clock_ts(ts);
-}
-
-EXPORT_SYMBOL(getnstimeofday);
-
-/**
- * do_gettimeofday - Returns the time of day in a timeval
- * @tv: pointer to the timeval to be set
- *
- * NOTE: Users should be converted to using get_realtime_clock_ts()
- */
-void do_gettimeofday(struct timeval *tv)
-{
- struct timespec now;
-
- __get_realtime_clock_ts(&now);
- tv->tv_sec = now.tv_sec;
- tv->tv_usec = now.tv_nsec/1000;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
- *
- * Sets the time of day to the new time and update NTP and notify hrtimers
- */
-int do_settimeofday(struct timespec *tv)
-{
- unsigned long flags;
- time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
-
- if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
- return -EINVAL;
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- nsec -= __get_nsec_offset();
-
- wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
- wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
- set_normalized_timespec(&xtime, sec, nsec);
- set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
- clock->error = 0;
- ntp_clear();
-
- update_vsyscall(&xtime, clock);
-
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- /* signal hrtimers about time change */
- clock_was_set();
-
- return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
-/**
- * change_clocksource - Swaps clocksources if a new one is available
- *
- * Accumulates current time interval and initializes new clocksource
- */
-static void change_clocksource(void)
-{
- struct clocksource *new;
- cycle_t now;
- u64 nsec;
-
- new = clocksource_get_next();
-
- if (clock == new)
- return;
-
- now = clocksource_read(new);
- nsec = __get_nsec_offset();
- timespec_add_ns(&xtime, nsec);
-
- clock = new;
- clock->cycle_last = now;
-
- clock->error = 0;
- clock->xtime_nsec = 0;
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
-
- tick_clock_notify();
-
- printk(KERN_INFO "Time: %s clocksource has been installed.\n",
- clock->name);
-}
-#else
-static inline void change_clocksource(void) { }
-#endif
-
-/**
- * timekeeping_is_continuous - check to see if timekeeping is free running
- */
-int timekeeping_is_continuous(void)
-{
- unsigned long seq;
- int ret;
-
- do {
- seq = read_seqbegin(&xtime_lock);
-
- ret = clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
-
- } while (read_seqretry(&xtime_lock, seq));
-
- return ret;
-}
-
-/**
- * read_persistent_clock - Return time in seconds from the persistent clock.
- *
- * Weak dummy function for arches that do not yet support it.
- * Returns seconds from epoch using the battery backed persistent clock.
- * Returns zero if unsupported.
- *
- * XXX - Do be sure to remove it once all arches implement it.
- */
-unsigned long __attribute__((weak)) read_persistent_clock(void)
-{
- return 0;
-}
-
-/*
- * timekeeping_init - Initializes the clocksource and common timekeeping values
- */
-void __init timekeeping_init(void)
-{
- unsigned long flags;
- unsigned long sec = read_persistent_clock();
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- ntp_clear();
-
- clock = clocksource_get_next();
- clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
- clock->cycle_last = clocksource_read(clock);
-
- xtime.tv_sec = sec;
- xtime.tv_nsec = 0;
- set_normalized_timespec(&wall_to_monotonic,
- -xtime.tv_sec, -xtime.tv_nsec);
-
- write_sequnlock_irqrestore(&xtime_lock, flags);
-}
-
-/* flag for if timekeeping is suspended */
-static int timekeeping_suspended;
-/* time in seconds when suspend began */
-static unsigned long timekeeping_suspend_time;
-
-/**
- * timekeeping_resume - Resumes the generic timekeeping subsystem.
- * @dev: unused
- *
- * This is for the generic clocksource timekeeping.
- * xtime/wall_to_monotonic/jiffies/etc are
- * still managed by arch specific suspend/resume code.
- */
-static int timekeeping_resume(struct sys_device *dev)
-{
- unsigned long flags;
- unsigned long now = read_persistent_clock();
-
- write_seqlock_irqsave(&xtime_lock, flags);
-
- if (now && (now > timekeeping_suspend_time)) {
- unsigned long sleep_length = now - timekeeping_suspend_time;
-
- xtime.tv_sec += sleep_length;
- wall_to_monotonic.tv_sec -= sleep_length;
- }
- /* re-base the last cycle value */
- clock->cycle_last = clocksource_read(clock);
- clock->error = 0;
- timekeeping_suspended = 0;
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- touch_softlockup_watchdog();
-
- clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
- /* Resume hrtimers */
- clock_was_set();
-
- return 0;
-}
-
-static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&xtime_lock, flags);
- timekeeping_suspended = 1;
- timekeeping_suspend_time = read_persistent_clock();
- write_sequnlock_irqrestore(&xtime_lock, flags);
-
- clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
-
- return 0;
-}
-
-/* sysfs resume/suspend bits for timekeeping */
-static struct sysdev_class timekeeping_sysclass = {
- .resume = timekeeping_resume,
- .suspend = timekeeping_suspend,
- set_kset_name("timekeeping"),
-};
-
-static struct sys_device device_timer = {
- .id = 0,
- .cls = &timekeeping_sysclass,
-};
-
-static int __init timekeeping_init_device(void)
-{
- int error = sysdev_class_register(&timekeeping_sysclass);
- if (!error)
- error = sysdev_register(&device_timer);
- return error;
-}
-
-device_initcall(timekeeping_init_device);
-
-/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
- */
-static __always_inline int clocksource_bigadjust(s64 error, s64 *interval,
- s64 *offset)
-{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
-
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = current_tick_length() >>
- (TICK_LENGTH_SHIFT - clock->shift + 1);
- tick_error -= clock->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
- }
- for (adj = 0; error > i; adj++)
- error >>= 1;
-
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
- */
-static void clocksource_adjust(struct clocksource *clock, s64 offset)
-{
- s64 error, interval = clock->cycle_interval;
- int adj;
-
- error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
- if (error > interval) {
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else if (error < -interval) {
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else
- adj = clocksource_bigadjust(error, &interval, &offset);
- } else
- return;
-
- clock->mult += adj;
- clock->xtime_interval += interval;
- clock->xtime_nsec -= offset;
- clock->error -= (interval - offset) <<
- (TICK_LENGTH_SHIFT - clock->shift);
-}
-
-/**
- * update_wall_time - Uses the current clocksource to increment the wall time
- *
- * Called from the timer interrupt, must hold a write on xtime_lock.
- */
-static void update_wall_time(void)
-{
- cycle_t offset;
-
- /* Make sure we're fully resumed: */
- if (unlikely(timekeeping_suspended))
- return;
-
-#ifdef CONFIG_GENERIC_TIME
- offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
-#else
- offset = clock->cycle_interval;
-#endif
- clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
-
- /* normally this loop will run just once, however in the
- * case of lost or late ticks, it will accumulate correctly.
- */
- while (offset >= clock->cycle_interval) {
- /* accumulate one interval */
- clock->xtime_nsec += clock->xtime_interval;
- clock->cycle_last += clock->cycle_interval;
- offset -= clock->cycle_interval;
-
- if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
- clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
- xtime.tv_sec++;
- second_overflow();
- }
-
- /* interpolator bits */
- time_interpolator_update(clock->xtime_interval
- >> clock->shift);
-
- /* accumulate error between NTP and clock interval */
- clock->error += current_tick_length();
- clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
- }
-
- /* correct the clock when NTP error is too big */
- clocksource_adjust(clock, offset);
-
- /* store full nanoseconds into xtime */
- xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
- clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
-
- /* check to see if there is a new clocksource to use */
- change_clocksource();
- update_vsyscall(&xtime, clock);
-}
-
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1249,14 +857,6 @@ static inline void calc_load(unsigned long ticks)
}
/*
- * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
- */
-__attribute__((weak)) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
-
-EXPORT_SYMBOL(xtime_lock);
-
-/*
* This function runs timers and the timer-tq in bottom half context.
*/
static void run_timer_softirq(struct softirq_action *h)
@@ -1602,6 +1202,13 @@ static int __devinit init_timers_cpu(int cpu)
cpu_to_node(cpu));
if (!base)
return -ENOMEM;
+
+ /* Make sure that tvec_base is 2 byte aligned */
+ if (tbase_get_deferrable(base)) {
+ WARN_ON(1);
+ kfree(base);
+ return -ENOMEM;
+ }
memset(base, 0, sizeof(*base));
per_cpu(tvec_bases, cpu) = base;
} else {
@@ -1641,9 +1248,9 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
struct timer_list *timer;
while (!list_empty(head)) {
- timer = list_entry(head->next, struct timer_list, entry);
+ timer = list_first_entry(head, struct timer_list, entry);
detach_timer(timer, 0);
- timer->base = new_base;
+ timer_set_base(timer, new_base);
internal_add_timer(new_base, timer);
}
}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 187e2a42387..dd308ba4e03 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -6,7 +6,6 @@
#include <linux/mm.h>
#include <linux/utsname.h>
#include <linux/mman.h>
-#include <linux/smp_lock.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/prctl.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index c859164a699..160c8c5136b 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -32,58 +32,25 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
}
/*
- * unshare the current process' utsname namespace.
- * called only in sys_unshare()
- */
-int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts)
-{
- if (unshare_flags & CLONE_NEWUTS) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- *new_uts = clone_uts_ns(current->nsproxy->uts_ns);
- if (!*new_uts)
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/*
* Copy task tsk's utsname namespace, or clone it if flags
* specifies CLONE_NEWUTS. In latter case, changes to the
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-int copy_utsname(int flags, struct task_struct *tsk)
+struct uts_namespace *copy_utsname(int flags, struct uts_namespace *old_ns)
{
- struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
- int err = 0;
-
- if (!old_ns)
- return 0;
+ BUG_ON(!old_ns);
get_uts_ns(old_ns);
if (!(flags & CLONE_NEWUTS))
- return 0;
-
- if (!capable(CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out;
- }
+ return old_ns;
new_ns = clone_uts_ns(old_ns);
- if (!new_ns) {
- err = -ENOMEM;
- goto out;
- }
- tsk->nsproxy->uts_ns = new_ns;
-out:
put_uts_ns(old_ns);
- return err;
+ return new_ns;
}
void free_uts_ns(struct kref *kref)