diff options
Diffstat (limited to 'kernel')
35 files changed, 1829 insertions, 922 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 00000000000..f2ab70073bd --- /dev/null +++ b/kernel/.gitignore @@ -0,0 +1,5 @@ +# +# Generated files +# +config_data.h +config_data.gz diff --git a/kernel/acct.c b/kernel/acct.c index 6312d6bd43e..38d57fa6b78 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -427,6 +427,7 @@ static void do_acct_process(long exitcode, struct file *file) u64 elapsed; u64 run_time; struct timespec uptime; + unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -467,12 +468,12 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ( - current->signal->utime + - current->group_leader->utime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ( - current->signal->stime + - current->group_leader->stime)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->utime, + current->signal->utime)); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); + jiffies = cputime_to_jiffies(cputime_add(current->group_leader->stime, + current->signal->stime)); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -580,7 +581,8 @@ void acct_process(long exitcode) void acct_update_integrals(struct task_struct *tsk) { if (likely(tsk->mm)) { - long delta = tsk->stime - tsk->acct_stimexpd; + long delta = + cputime_to_jiffies(tsk->stime) - tsk->acct_stimexpd; if (delta == 0) return; diff --git a/kernel/audit.c b/kernel/audit.c index 0c56320d38d..d13ab7d2d89 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid) return old; } -int kauditd_thread(void *dummy) +static int kauditd_thread(void *dummy) { struct sk_buff *skb; @@ -291,8 +291,10 @@ int kauditd_thread(void *dummy) set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&kauditd_wait, &wait); - if (!skb_queue_len(&audit_skb_queue)) + if (!skb_queue_len(&audit_skb_queue)) { + try_to_freeze(); schedule(); + } __set_current_state(TASK_RUNNING); remove_wait_queue(&kauditd_wait, &wait); diff --git a/kernel/configs.c b/kernel/configs.c index 986f7af31e0..009e1ebdcb8 100644 --- a/kernel/configs.c +++ b/kernel/configs.c @@ -3,7 +3,7 @@ * Echo the kernel .config file used to build the kernel * * Copyright (C) 2002 Khalid Aziz <khalid_aziz@hp.com> - * Copyright (C) 2002 Randy Dunlap <rddunlap@osdl.org> + * Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net> * Copyright (C) 2002 Al Stone <ahs3@fc.hp.com> * Copyright (C) 2002 Hewlett-Packard Company * diff --git a/kernel/cpu.c b/kernel/cpu.c index d61ba88f34e..e882c6babf4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,47 +16,76 @@ #include <asm/semaphore.h> /* This protects CPUs going up and down... */ -DECLARE_MUTEX(cpucontrol); -EXPORT_SYMBOL_GPL(cpucontrol); +static DECLARE_MUTEX(cpucontrol); static struct notifier_block *cpu_chain; -/* - * Used to check by callers if they need to acquire the cpucontrol - * or not to protect a cpu from being removed. Its sometimes required to - * call these functions both for normal operations, and in response to - * a cpu being added/removed. If the context of the call is in the same - * thread context as a CPU hotplug thread, we dont need to take the lock - * since its already protected - * check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj - */ +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct *lock_cpu_hotplug_owner; +static int lock_cpu_hotplug_depth; -int current_in_cpu_hotplug(void) +static int __lock_cpu_hotplug(int interruptible) { - return (current->flags & PF_HOTPLUG_CPU); + int ret = 0; + + if (lock_cpu_hotplug_owner != current) { + if (interruptible) + ret = down_interruptible(&cpucontrol); + else + down(&cpucontrol); + } + + /* + * Set only if we succeed in locking + */ + if (!ret) { + lock_cpu_hotplug_depth++; + lock_cpu_hotplug_owner = current; + } + + return ret; } -EXPORT_SYMBOL_GPL(current_in_cpu_hotplug); +void lock_cpu_hotplug(void) +{ + __lock_cpu_hotplug(0); +} +EXPORT_SYMBOL_GPL(lock_cpu_hotplug); +void unlock_cpu_hotplug(void) +{ + if (--lock_cpu_hotplug_depth == 0) { + lock_cpu_hotplug_owner = NULL; + up(&cpucontrol); + } +} +EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); + +int lock_cpu_hotplug_interruptible(void) +{ + return __lock_cpu_hotplug(1); +} +EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); +#endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ int register_cpu_notifier(struct notifier_block *nb) { int ret; - if ((ret = down_interruptible(&cpucontrol)) != 0) + if ((ret = lock_cpu_hotplug_interruptible()) != 0) return ret; ret = notifier_chain_register(&cpu_chain, nb); - up(&cpucontrol); + unlock_cpu_hotplug(); return ret; } EXPORT_SYMBOL(register_cpu_notifier); void unregister_cpu_notifier(struct notifier_block *nb) { - down(&cpucontrol); + lock_cpu_hotplug(); notifier_chain_unregister(&cpu_chain, nb); - up(&cpucontrol); + unlock_cpu_hotplug(); } EXPORT_SYMBOL(unregister_cpu_notifier); @@ -112,13 +141,6 @@ int cpu_down(unsigned int cpu) goto out; } - /* - * Leave a trace in current->flags indicating we are already in - * process of performing CPU hotplug. Callers can check if cpucontrol - * is already acquired by current thread, and if so not cause - * a dead lock by not acquiring the lock - */ - current->flags |= PF_HOTPLUG_CPU; err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, (void *)(long)cpu); if (err == NOTIFY_BAD) { @@ -171,7 +193,6 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out: - current->flags &= ~PF_HOTPLUG_CPU; unlock_cpu_hotplug(); return err; } @@ -182,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = down_interruptible(&cpucontrol)) != 0) + if ((ret = lock_cpu_hotplug_interruptible()) != 0) return ret; if (cpu_online(cpu) || !cpu_present(cpu)) { @@ -190,11 +211,6 @@ int __devinit cpu_up(unsigned int cpu) goto out; } - /* - * Leave a trace in current->flags indicating we are already in - * process of performing CPU hotplug. - */ - current->flags |= PF_HOTPLUG_CPU; ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); if (ret == NOTIFY_BAD) { printk("%s: attempt to bring up CPU %u failed\n", @@ -217,7 +233,6 @@ out_notify: if (ret != 0) notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - current->flags &= ~PF_HOTPLUG_CPU; - up(&cpucontrol); + unlock_cpu_hotplug(); return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f981..eab64e23bca 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -39,6 +39,7 @@ #include <linux/namei.h> #include <linux/pagemap.h> #include <linux/proc_fs.h> +#include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/seq_file.h> #include <linux/slab.h> @@ -54,7 +55,23 @@ #include <asm/atomic.h> #include <asm/semaphore.h> -#define CPUSET_SUPER_MAGIC 0x27e0eb +#define CPUSET_SUPER_MAGIC 0x27e0eb + +/* + * Tracks how many cpusets are currently defined in system. + * When there is only one cpuset (the root cpuset) we can + * short circuit some hooks. + */ +int number_of_cpusets __read_mostly; + +/* See "Frequency meter" comments, below. */ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; struct cpuset { unsigned long flags; /* "unsigned long" so bitops work */ @@ -80,13 +97,16 @@ struct cpuset { * Copy of global cpuset_mems_generation as of the most * recent time this cpuset changed its mems_allowed. */ - int mems_generation; + int mems_generation; + + struct fmeter fmeter; /* memory_pressure filter */ }; /* bits in struct cpuset flags field */ typedef enum { CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, + CS_MEMORY_MIGRATE, CS_REMOVED, CS_NOTIFY_ON_RELEASE } cpuset_flagbits_t; @@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs) return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); } +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + /* * Increment this atomic integer everytime any cpuset changes its * mems_allowed value. Users of cpusets can track this generation @@ -137,13 +162,10 @@ static struct cpuset top_cpuset = { .count = ATOMIC_INIT(0), .sibling = LIST_HEAD_INIT(top_cpuset.sibling), .children = LIST_HEAD_INIT(top_cpuset.children), - .parent = NULL, - .dentry = NULL, - .mems_generation = 0, }; static struct vfsmount *cpuset_mount; -static struct super_block *cpuset_sb = NULL; +static struct super_block *cpuset_sb; /* * We have two global cpuset semaphores below. They can nest. @@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL; * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cpuset pointer by attach_task() and the + * access of task->cpuset->mems_generation via that pointer in + * the routine cpuset_update_task_memory_state(). */ static DECLARE_MUTEX(manage_sem); @@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } @@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) BUG_ON(!nodes_intersects(*pmask, node_online_map)); } -/* - * Refresh current tasks mems_allowed and mems_generation from current - * tasks cpuset. +/** + * cpuset_update_task_memory_state - update task memory placement * - * Call without callback_sem or task_lock() held. May be called with - * or without manage_sem held. Will acquire task_lock() and might - * acquire callback_sem during call. + * If the current tasks cpusets mems_allowed changed behind our + * backs, update current->mems_allowed, mems_generation and task NUMA + * mempolicy to the new value. + * + * Task mempolicy is updated by rebinding it relative to the + * current->cpuset if a task has its memory placement changed. + * Do not call this routine if in_interrupt(). * - * The task_lock() is required to dereference current->cpuset safely. - * Without it, we could pick up the pointer value of current->cpuset - * in one instruction, and then attach_task could give us a different - * cpuset, and then the cpuset we had could be removed and freed, - * and then on our next instruction, we could dereference a no longer - * valid cpuset pointer to get its mems_generation field. + * Call without callback_sem or task_lock() held. May be called + * with or without manage_sem held. Doesn't need task_lock to guard + * against another task changing a non-NULL cpuset pointer to NULL, + * as that is only done by a task on itself, and if the current task + * is here, it is not simultaneously in the exit code NULL'ing its + * cpuset pointer. This routine also might acquire callback_sem and + * current->mm->mmap_sem during call. + * + * Reading current->cpuset->mems_generation doesn't need task_lock + * to guard the current->cpuset derefence, because it is guarded + * from concurrent freeing of current->cpuset by attach_task(), + * using RCU. + * + * The rcu_dereference() is technically probably not needed, + * as I don't actually mind if I see a new cpuset pointer but + * an old value of mems_generation. However this really only + * matters on alpha systems using cpusets heavily. If I dropped + * that rcu_dereference(), it would save them a memory barrier. + * For all other arch's, rcu_dereference is a no-op anyway, and for + * alpha systems not using cpusets, another planned optimization, + * avoiding the rcu critical section for tasks in the root cpuset + * which is statically allocated, so can't vanish, will make this + * irrelevant. Better to use RCU as intended, than to engage in + * some cute trick to save a memory barrier that is impossible to + * test, for alpha systems using cpusets heavily, which might not + * even exist. * * This routine is needed to update the per-task mems_allowed data, * within the tasks context, when it is trying to allocate memory @@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. */ -static void refresh_mems(void) +void cpuset_update_task_memory_state() { int my_cpusets_mem_gen; + struct task_struct *tsk = current; + struct cpuset *cs; - task_lock(current); - my_cpusets_mem_gen = current->cpuset->mems_generation; - task_unlock(current); - - if (current->cpuset_mems_generation != my_cpusets_mem_gen) { - struct cpuset *cs; - nodemask_t oldmem = current->mems_allowed; + if (tsk->cpuset == &top_cpuset) { + /* Don't need rcu for top_cpuset. It's never freed. */ + my_cpusets_mem_gen = top_cpuset.mems_generation; + } else { + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); + my_cpusets_mem_gen = cs->mems_generation; + rcu_read_unlock(); + } + if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { down(&callback_sem); - task_lock(current); - cs = current->cpuset; - guarantee_online_mems(cs, ¤t->mems_allowed); - current->cpuset_mems_generation = cs->mems_generation; - task_unlock(current); + task_lock(tsk); + cs = tsk->cpuset; /* Maybe changed when task not locked */ + guarantee_online_mems(cs, &tsk->mems_allowed); + tsk->cpuset_mems_generation = cs->mems_generation; + task_unlock(tsk); up(&callback_sem); - if (!nodes_equal(oldmem, current->mems_allowed)) - numa_policy_rebind(&oldmem, ¤t->mems_allowed); + mpol_rebind_task(tsk, &tsk->mems_allowed); } } @@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + nodemask_t oldmem; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int migrate; + int fudge; int retval; trialcs = *cs; retval = nodelist_parse(buf, trialcs.mems_allowed); if (retval < 0) - return retval; + goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); - if (nodes_empty(trialcs.mems_allowed)) - return -ENOSPC; + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } + if (nodes_empty(trialcs.mems_allowed)) { + retval = -ENOSPC; + goto done; + } retval = validate_change(cs, &trialcs); - if (retval == 0) { - down(&callback_sem); - cs->mems_allowed = trialcs.mems_allowed; - atomic_inc(&cpuset_mems_generation); - cs->mems_generation = atomic_read(&cpuset_mems_generation); - up(&callback_sem); + if (retval < 0) + goto done; + + down(&callback_sem); + cs->mems_allowed = trialcs.mems_allowed; + atomic_inc(&cpuset_mems_generation); + cs->mems_generation = atomic_read(&cpuset_mems_generation); + up(&callback_sem); + + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. + */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. + */ + migrate = is_memory_migrate(cs); + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) { + do_migrate_pages(mm, &oldmem, &cs->mems_allowed, + MPOL_MF_MOVE_ALL); + } + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. */ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; +done: return retval; } /* + * Call with manage_sem held. + */ + +static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) +{ + if (simple_strtoul(buf, NULL, 10) != 0) + cpuset_memory_pressure_enabled = 1; + else + cpuset_memory_pressure_enabled = 0; + return 0; +} + +/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, - * CS_NOTIFY_ON_RELEASE) + * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) * cs: the cpuset to update * buf: the buffer where we read the 0 or 1 * @@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) } /* + * Frequency meter - How fast is some event occuring? + * + * These routines manage a digitally filtered, constant time based, + * event frequency meter. There are four routines: + * fmeter_init() - initialize a frequency meter. + * fmeter_markevent() - called each time the event happens. + * fmeter_getrate() - returns the recent rate of such events. + * fmeter_update() - internal routine used to update fmeter. + * + * A common data structure is passed to each of these routines, + * which is used to keep track of the state required to manage the + * frequency meter and its digital filter. + * + * The filter works on the number of events marked per unit time. + * The filter is single-pole low-pass recursive (IIR). The time unit + * is 1 second. Arithmetic is done using 32-bit integers scaled to + * simulate 3 decimal digits of precision (multiplied by 1000). + * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. + */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +static void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time_t now = get_seconds(); + time_t ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly * writing the path of the old cpuset in 'ppathbuf' if it needs to be * notified on release. @@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; + nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) return -ESRCH; } atomic_inc(&cs->count); - tsk->cpuset = cs; + rcu_assign_pointer(tsk->cpuset, cs); task_unlock(tsk); guarantee_online_cpus(cs, &cpus); set_cpus_allowed(tsk, cpus); + from = oldcs->mems_allowed; + to = cs->mems_allowed; + up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + + if (is_memory_migrate(cs)) + do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); + synchronize_rcu(); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); return 0; @@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) typedef enum { FILE_ROOT, FILE_DIR, + FILE_MEMORY_MIGRATE, FILE_CPULIST, FILE_MEMLIST, FILE_CPU_EXCLUSIVE, FILE_MEM_EXCLUSIVE, FILE_NOTIFY_ON_RELEASE, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, FILE_TASKLIST, } cpuset_filetype_t; @@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us case FILE_NOTIFY_ON_RELEASE: retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + retval = update_memory_pressure_enabled(cs, buffer); + break; + case FILE_MEMORY_PRESSURE: + retval = -EACCES; + break; case FILE_TASKLIST: retval = attach_task(cs, buffer, &pathbuf); break; @@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, case FILE_NOTIFY_ON_RELEASE: *s++ = notify_on_release(cs) ? '1' : '0'; break; + case FILE_MEMORY_MIGRATE: + *s++ = is_memory_migrate(cs) ? '1' : '0'; + break; + case FILE_MEMORY_PRESSURE_ENABLED: + *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; + break; + case FILE_MEMORY_PRESSURE: + s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); + break; default: retval = -EINVAL; goto out; @@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) /* * cpuset_create_dir - create a directory for an object. - * cs: the cpuset we create the directory for. + * cs: the cpuset we create the directory for. * It must have a valid ->parent field * And we are going to fill its ->dentry field. * name: The name to give to the cpuset directory. Will be copied. @@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = { .private = FILE_NOTIFY_ON_RELEASE, }; +static struct cftype cft_memory_migrate = { + .name = "memory_migrate", + .private = FILE_MEMORY_MIGRATE, +}; + +static struct cftype cft_memory_pressure_enabled = { + .name = "memory_pressure_enabled", + .private = FILE_MEMORY_PRESSURE_ENABLED, +}; + +static struct cftype cft_memory_pressure = { + .name = "memory_pressure", + .private = FILE_MEMORY_PRESSURE, +}; + static int cpuset_populate_dir(struct dentry *cs_dentry) { int err; @@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) return err; if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) + return err; + if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) + return err; if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) return err; return 0; @@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) return -ENOMEM; down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); cs->flags = 0; if (notify_on_release(parent)) set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); @@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) INIT_LIST_HEAD(&cs->children); atomic_inc(&cpuset_mems_generation); cs->mems_generation = atomic_read(&cpuset_mems_generation); + fmeter_init(&cs->fmeter); cs->parent = parent; down(&callback_sem); list_add(&cs->sibling, &cs->parent->children); + number_of_cpusets++; up(&callback_sem); err = cpuset_create_dir(cs, name, mode); @@ -1503,7 +1826,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_sem already */ down(&manage_sem); - refresh_mems(); + cpuset_update_task_memory_state(); if (atomic_read(&cs->count) > 0) { up(&manage_sem); return -EBUSY; @@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) spin_unlock(&d->d_lock); cpuset_d_remove_dir(d); dput(d); + number_of_cpusets--; up(&callback_sem); if (list_empty(&parent->children)) check_for_release(parent, &pathbuf); @@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } +/* + * cpuset_init_early - just enough so that the calls to + * cpuset_update_task_memory_state() in early init code + * are harmless. + */ + +int __init cpuset_init_early(void) +{ + struct task_struct *tsk = current; + + tsk->cpuset = &top_cpuset; + tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + return 0; +} + /** * cpuset_init - initialize cpusets at system boot * @@ -1546,6 +1885,7 @@ int __init cpuset_init(void) top_cpuset.cpus_allowed = CPU_MASK_ALL; top_cpuset.mems_allowed = NODE_MASK_ALL; + fmeter_init(&top_cpuset.fmeter); atomic_inc(&cpuset_mems_generation); top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); @@ -1566,7 +1906,11 @@ int __init cpuset_init(void) root->d_inode->i_nlink++; top_cpuset.dentry = root; root->d_inode->i_op = &cpuset_dir_inode_operations; + number_of_cpusets = 1; err = cpuset_populate_dir(root); + /* memory_pressure_enabled is in root cpuset only */ + if (err == 0) + err = cpuset_add_file(root, &cft_memory_pressure_enabled); out: return err; } @@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child) * * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't - * mess with it. + * mess with it, or task is a failed fork, never visible to attach_task. **/ void cpuset_exit(struct task_struct *tsk) { struct cpuset *cs; - BUG_ON(!(tsk->flags & PF_EXITING)); - cs = tsk->cpuset; tsk->cpuset = NULL; @@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk) * tasks cpuset. **/ -cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) +cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) { cpumask_t mask; down(&callback_sem); - task_lock((struct task_struct *)tsk); + task_lock(tsk); guarantee_online_cpus(tsk->cpuset, &mask); - task_unlock((struct task_struct *)tsk); + task_unlock(tsk); up(&callback_sem); return mask; @@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void) } /** - * cpuset_update_current_mems_allowed - update mems parameters to new values - * - * If the current tasks cpusets mems_allowed changed behind our backs, - * update current->mems_allowed and mems_generation to the new value. - * Do not call this routine if in_interrupt(). + * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. * - * Call without callback_sem or task_lock() held. May be called - * with or without manage_sem held. Unless exiting, it will acquire - * task_lock(). Also might acquire callback_sem during call to - * refresh_mems(). - */ + * Description: Returns the nodemask_t mems_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of node_online_map, even if this means going outside the + * tasks cpuset. + **/ -void cpuset_update_current_mems_allowed(void) +nodemask_t cpuset_mems_allowed(struct task_struct *tsk) { - struct cpuset *cs; - int need_to_refresh = 0; + nodemask_t mask; - task_lock(current); - cs = current->cpuset; - if (!cs) - goto done; - if (current->cpuset_mems_generation != cs->mems_generation) - need_to_refresh = 1; -done: - task_unlock(current); - if (need_to_refresh) - refresh_mems(); -} + down(&callback_sem); + task_lock(tsk); + guarantee_online_mems(tsk->cpuset, &mask); + task_unlock(tsk); + up(&callback_sem); -/** - * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed - * @nodes: pointer to a node bitmap that is and-ed with mems_allowed - */ -void cpuset_restrict_to_mems_allowed(unsigned long *nodes) -{ - bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), - MAX_NUMNODES); + return mask; } /** @@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) * GFP_USER - only nodes in current tasks mems allowed ok. **/ -int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { int node; /* node that zone z is on */ const struct cpuset *cs; /* current cpuset ancestors */ @@ -1867,6 +2192,42 @@ done: } /* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/** + * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + **/ + +void __cpuset_memory_pressure_bump(void) +{ + struct cpuset *cs; + + task_lock(current); + cs = current->cpuset; + fmeter_markevent(&cs->fmeter); + task_unlock(current); +} + +/* * proc_cpuset_show() * - Print tasks cpuset path into seq_file. * - Used for /proc/<pid>/cpuset. diff --git a/kernel/exit.c b/kernel/exit.c index ee515683b92..caceabf3f23 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -72,7 +72,6 @@ repeat: __ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); __exit_signal(p); - __exit_sighand(p); /* * Note that the fastpath in sys_times depends on __exit_signal having * updated the counters before a task is removed from the tasklist of @@ -258,7 +257,7 @@ static inline void reparent_to_init(void) void __set_special_pids(pid_t session, pid_t pgrp) { - struct task_struct *curr = current; + struct task_struct *curr = current->group_leader; if (curr->signal->session != session) { detach_pid(curr, PIDTYPE_SID); @@ -926,7 +925,6 @@ do_group_exit(int exit_code) /* Another thread got here before we took the lock. */ exit_code = sig->group_exit_code; else { - sig->flags = SIGNAL_GROUP_EXIT; sig->group_exit_code = exit_code; zap_other_threads(current); } diff --git a/kernel/fork.c b/kernel/fork.c index 1c1cf8dc396..72e3252c676 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -743,6 +743,14 @@ int unshare_files(void) EXPORT_SYMBOL(unshare_files); +void sighand_free_cb(struct rcu_head *rhp) +{ + struct sighand_struct *sp; + + sp = container_of(rhp, struct sighand_struct, rcu); + kmem_cache_free(sighand_cachep, sp); +} + static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) { struct sighand_struct *sig; @@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); - tsk->sighand = sig; + rcu_assign_pointer(tsk->sighand, sig); if (!sig) return -ENOMEM; spin_lock_init(&sig->siglock); @@ -803,9 +811,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->it_prof_expires = cputime_zero; sig->it_prof_incr = cputime_zero; - sig->tty = current->signal->tty; - sig->pgrp = process_group(current); - sig->session = current->signal->session; sig->leader = 0; /* session leadership doesn't inherit */ sig->tty_old_pgrp = 0; @@ -964,12 +969,13 @@ static task_t *copy_process(unsigned long clone_flags, p->io_context = NULL; p->io_wait = NULL; p->audit_context = NULL; + cpuset_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup; + goto bad_fork_cleanup_cpuset; } #endif @@ -1124,29 +1130,22 @@ static task_t *copy_process(unsigned long clone_flags, if (unlikely(p->ptrace & PT_PTRACED)) __ptrace_link(p, current->parent); - cpuset_fork(p); - attach_pid(p, PIDTYPE_PID, p->pid); attach_pid(p, PIDTYPE_TGID, p->tgid); if (thread_group_leader(p)) { + p->signal->tty = current->signal->tty; + p->signal->pgrp = process_group(current); + p->signal->session = current->signal->session; attach_pid(p, PIDTYPE_PGID, process_group(p)); attach_pid(p, PIDTYPE_SID, p->signal->session); if (p->pid) __get_cpu_var(process_counts)++; } - proc_fork_connector(p); - if (!current->signal->tty && p->signal->tty) - p->signal->tty = NULL; - nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); - retval = 0; - -fork_out: - if (retval) - return ERR_PTR(retval); + proc_fork_connector(p); return p; bad_fork_cleanup_namespace: @@ -1173,7 +1172,9 @@ bad_fork_cleanup_security: bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); +bad_fork_cleanup_cpuset: #endif + cpuset_exit(p); bad_fork_cleanup: if (p->binfmt) module_put(p->binfmt->module); @@ -1185,7 +1186,8 @@ bad_fork_cleanup_count: free_uid(p->user); bad_fork_free: free_task(p); - goto fork_out; +fork_out: + return ERR_PTR(retval); } struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) diff --git a/kernel/futex.c b/kernel/futex.c index 5872e3507f3..5efa2f97803 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -270,7 +270,13 @@ static void wake_futex(struct futex_q *q) /* * The waiting task can free the futex_q as soon as this is written, * without taking any locks. This must come last. + * + * A memory barrier is required here to prevent the following store + * to lock_ptr from getting ahead of the wakeup. Clearing the lock + * at the end of wake_up_all() does not prevent this store from + * moving. */ + wmb(); q->lock_ptr = NULL; } @@ -350,6 +356,13 @@ retry: if (bh1 != bh2) spin_unlock(&bh2->lock); +#ifndef CONFIG_MMU + /* we don't get EFAULT from MMU faults if we don't have an MMU, + * but we might get them from range checking */ + ret = op_ret; + goto out; +#endif + if (unlikely(op_ret != -EFAULT)) { ret = op_ret; goto out; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 81c49a4d679..97d5559997d 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -366,6 +366,8 @@ int request_irq(unsigned int irq, action->next = NULL; action->dev_id = dev_id; + select_smp_affinity(irq); + retval = setup_irq(irq, action); if (retval) kfree(action); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f26e534c658..d03b5eef8ce 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -10,6 +10,8 @@ #include <linux/proc_fs.h> #include <linux/interrupt.h> +#include "internals.h" + static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; #ifdef CONFIG_SMP @@ -68,7 +70,9 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, */ cpus_and(tmp, new_value, cpu_online_map); if (cpus_empty(tmp)) - return -EINVAL; + /* Special case for empty set - allow the architecture + code to set default SMP affinity. */ + return select_smp_affinity(irq) ? -EINVAL : full_count; proc_set_irq_affinity(irq, new_value); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5beda378cc7..3bb71e63a37 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -246,6 +246,19 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return ret; } +/* Walks the list and increments nmissed count for multiprobe case */ +void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +{ + struct kprobe *kp; + if (p->pre_handler != aggr_pre_handler) { + p->nmissed++; + } else { + list_for_each_entry_rcu(kp, &p->list, list) + kp->nmissed++; + } + return; +} + /* Called with kretprobe_lock held */ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { @@ -399,10 +412,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); - INIT_HLIST_NODE(&ap->hlist); - hlist_del_rcu(&p->hlist); - hlist_add_head_rcu(&ap->hlist, - &kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]); + hlist_replace_rcu(&p->hlist, &ap->hlist); } /* @@ -462,9 +472,16 @@ int __kprobes register_kprobe(struct kprobe *p) int ret = 0; unsigned long flags = 0; struct kprobe *old_p; + struct module *mod; + + if ((!kernel_text_address((unsigned long) p->addr)) || + in_kprobes_functions((unsigned long) p->addr)) + return -EINVAL; + + if ((mod = module_text_address((unsigned long) p->addr)) && + (unlikely(!try_module_get(mod)))) + return -EINVAL; - if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) - return ret; if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; @@ -488,6 +505,8 @@ out: rm_kprobe: if (ret == -EEXIST) arch_remove_kprobe(p); + if (ret && mod) + module_put(mod); return ret; } @@ -495,6 +514,7 @@ void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; + struct module *mod; spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); @@ -506,6 +526,10 @@ void __kprobes unregister_kprobe(struct kprobe *p) cleanup_kprobe(p, flags); synchronize_sched(); + + if ((mod = module_text_address((unsigned long)p->addr))) + module_put(mod); + if (old_p->pre_handler == aggr_pre_handler && list_empty(&old_p->list)) kfree(old_p); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 015fb69ad94..99af8b05eea 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -15,6 +15,9 @@ #include <linux/module.h> #include <linux/init.h> +u64 uevent_seqnum; +char uevent_helper[UEVENT_HELPER_PATH_LEN] = "/sbin/hotplug"; + #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -23,11 +26,29 @@ static struct subsys_attribute _name##_attr = \ __ATTR(_name, 0644, _name##_show, _name##_store) #ifdef CONFIG_HOTPLUG -static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page) +/* current uevent sequence number */ +static ssize_t uevent_seqnum_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); +} +KERNEL_ATTR_RO(uevent_seqnum); + +/* uevent helper program, used during early boo */ +static ssize_t uevent_helper_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%s\n", uevent_helper); +} +static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, size_t count) { - return sprintf(page, "%llu\n", (unsigned long long)hotplug_seqnum); + if (count+1 > UEVENT_HELPER_PATH_LEN) + return -ENOENT; + memcpy(uevent_helper, page, count); + uevent_helper[count] = '\0'; + if (count && uevent_helper[count-1] == '\n') + uevent_helper[count-1] = '\0'; + return count; } -KERNEL_ATTR_RO(hotplug_seqnum); +KERNEL_ATTR_RW(uevent_helper); #endif #ifdef CONFIG_KEXEC @@ -45,7 +66,8 @@ EXPORT_SYMBOL_GPL(kernel_subsys); static struct attribute * kernel_attrs[] = { #ifdef CONFIG_HOTPLUG - &hotplug_seqnum_attr.attr, + &uevent_seqnum_attr.attr, + &uevent_helper_attr.attr, #endif #ifdef CONFIG_KEXEC &crash_notes_attr.attr, diff --git a/kernel/module.c b/kernel/module.c index 2ea929d51ad..e4276046a1b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -496,15 +496,15 @@ static void module_unload_free(struct module *mod) } #ifdef CONFIG_MODULE_FORCE_UNLOAD -static inline int try_force(unsigned int flags) +static inline int try_force_unload(unsigned int flags) { int ret = (flags & O_TRUNC); if (ret) - add_taint(TAINT_FORCED_MODULE); + add_taint(TAINT_FORCED_RMMOD); return ret; } #else -static inline int try_force(unsigned int flags) +static inline int try_force_unload(unsigned int flags) { return 0; } @@ -524,7 +524,7 @@ static int __try_stop_module(void *_sref) /* If it's not unused, quit unless we are told to block. */ if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { - if (!(*sref->forced = try_force(sref->flags))) + if (!(*sref->forced = try_force_unload(sref->flags))) return -EWOULDBLOCK; } @@ -609,7 +609,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) /* If it has an init func, it must have an exit func to unload */ if ((mod->init != NULL && mod->exit == NULL) || mod->unsafe) { - forced = try_force(flags); + forced = try_force_unload(flags); if (!forced) { /* This module can't be removed */ ret = -EBUSY; @@ -958,7 +958,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, unsigned long ret; const unsigned long *crc; - spin_lock_irq(&modlist_lock); ret = __find_symbol(name, &owner, &crc, mod->license_gplok); if (ret) { /* use_module can fail due to OOM, or module unloading */ @@ -966,7 +965,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, !use_module(mod, owner)) ret = 0; } - spin_unlock_irq(&modlist_lock); return ret; } @@ -1204,6 +1202,39 @@ void *__symbol_get(const char *symbol) } EXPORT_SYMBOL_GPL(__symbol_get); +/* + * Ensure that an exported symbol [global namespace] does not already exist + * in the Kernel or in some other modules exported symbol table. + */ +static int verify_export_symbols(struct module *mod) +{ + const char *name = NULL; + unsigned long i, ret = 0; + struct module *owner; + const unsigned long *crc; + + for (i = 0; i < mod->num_syms; i++) + if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { + name = mod->syms[i].name; + ret = -ENOEXEC; + goto dup; + } + + for (i = 0; i < mod->num_gpl_syms; i++) + if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { + name = mod->gpl_syms[i].name; + ret = -ENOEXEC; + goto dup; + } + +dup: + if (ret) + printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", + mod->name, name, module_name(owner)); + + return ret; +} + /* Change all symbols so that sh_value encodes the pointer directly. */ static int simplify_symbols(Elf_Shdr *sechdrs, unsigned int symindex, @@ -1715,6 +1746,11 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + if (strcmp(mod->name, "driverloader") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + #ifdef CONFIG_MODULE_UNLOAD /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1767,6 +1803,12 @@ static struct module *load_module(void __user *umod, goto cleanup; } + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + + if (err < 0) + goto cleanup; + /* Set up and sort exception table */ mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); mod->extable = extable = (void *)sechdrs[exindex].sh_addr; @@ -1854,8 +1896,7 @@ static struct module *load_module(void __user *umod, kfree(args); free_hdr: vfree(hdr); - if (err < 0) return ERR_PTR(err); - else return ptr; + return ERR_PTR(err); truncated: printk(KERN_ERR "Module len %lu truncated\n", len); diff --git a/kernel/panic.c b/kernel/panic.c index aabc5f86fa3..c5c4ab25583 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -60,7 +60,7 @@ NORET_TYPE void panic(const char * fmt, ...) long i; static char buf[1024]; va_list args; -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) unsigned long caller = (unsigned long) __builtin_return_address(0); #endif @@ -125,7 +125,7 @@ NORET_TYPE void panic(const char * fmt, ...) printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); } #endif -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) disabled_wait(caller); #endif local_irq_enable(); diff --git a/kernel/params.c b/kernel/params.c index 47ba6954794..c76ad25e6a2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -619,7 +619,7 @@ static void __init param_sysfs_builtin(void) /* module-related sysfs stuff */ -#ifdef CONFIG_MODULES +#ifdef CONFIG_SYSFS #define to_module_attr(n) container_of(n, struct module_attribute, attr); #define to_module_kobject(n) container_of(n, struct module_kobject, kobj); diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681a..1acc0724699 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) struct hlist_node *elem; struct pid *pid; - hlist_for_each_entry(pid, elem, + hlist_for_each_entry_rcu(pid, elem, &pid_hash[type][pid_hashfn(nr)], pid_chain) { if (pid->nr == nr) return pid; @@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) task_pid = &task->pids[type]; pid = find_pid(type, nr); + task_pid->nr = nr; if (pid == NULL) { - hlist_add_head(&task_pid->pid_chain, - &pid_hash[type][pid_hashfn(nr)]); INIT_LIST_HEAD(&task_pid->pid_list); + hlist_add_head_rcu(&task_pid->pid_chain, + &pid_hash[type][pid_hashfn(nr)]); } else { INIT_HLIST_NODE(&task_pid->pid_chain); - list_add_tail(&task_pid->pid_list, &pid->pid_list); + list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); } - task_pid->nr = nr; return 0; } @@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) pid = &task->pids[type]; if (!hlist_unhashed(&pid->pid_chain)) { - hlist_del(&pid->pid_chain); - if (list_empty(&pid->pid_list)) + if (list_empty(&pid->pid_list)) { nr = pid->nr; - else { + hlist_del_rcu(&pid->pid_chain); + } else { pid_next = list_entry(pid->pid_list.next, struct pid, pid_list); /* insert next pid from pid_list to hash */ - hlist_add_head(&pid_next->pid_chain, - &pid_hash[type][pid_hashfn(pid_next->nr)]); + hlist_replace_rcu(&pid->pid_chain, + &pid_next->pid_chain); } } - list_del(&pid->pid_list); + list_del_rcu(&pid->pid_list); pid->nr = 0; return nr; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 84af54c39e1..4c68edff900 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -36,7 +36,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp) union cpu_time_count ret; ret.sched = 0; /* high half always zero when .cpu used */ if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { - ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; } else { ret.cpu = timespec_to_cputime(tp); } @@ -238,18 +238,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx, while ((t = next_thread(t)) != p) { cpu->sched += t->sched_time; } - if (p->tgid == current->tgid) { - /* - * We're sampling ourselves, so include the - * cycles not yet banked. We still omit - * other threads running on other CPUs, - * so the total can always be behind as - * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ). - */ - cpu->sched += current_sched_time(current); - } else { - cpu->sched += p->sched_time; - } + cpu->sched += sched_ns(p); break; } return 0; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 027322a564f..e24446f8d8c 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -24,10 +24,11 @@ extern suspend_disk_method_t pm_disk_mode; +extern int swsusp_shrink_memory(void); extern int swsusp_suspend(void); -extern int swsusp_write(void); +extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages); extern int swsusp_check(void); -extern int swsusp_read(void); +extern int swsusp_read(struct pbe **pblist_ptr); extern void swsusp_close(void); extern int swsusp_resume(void); @@ -73,31 +74,6 @@ static void power_down(suspend_disk_method_t mode) static int in_suspend __nosavedata = 0; -/** - * free_some_memory - Try to free as much memory as possible - * - * ... but do not OOM-kill anyone - * - * Notice: all userland should be stopped at this point, or - * livelock is possible. - */ - -static void free_some_memory(void) -{ - unsigned int i = 0; - unsigned int tmp; - unsigned long pages = 0; - char *p = "-\\|/"; - - printk("Freeing memory... "); - while ((tmp = shrink_all_memory(10000))) { - pages += tmp; - printk("\b%c", p[i++ % 4]); - } - printk("\bdone (%li pages freed)\n", pages); -} - - static inline void platform_finish(void) { if (pm_disk_mode == PM_DISK_PLATFORM) { @@ -127,8 +103,8 @@ static int prepare_processes(void) } /* Free memory before shutting down devices. */ - free_some_memory(); - return 0; + if (!(error = swsusp_shrink_memory())) + return 0; thaw: thaw_processes(); enable_nonboot_cpus(); @@ -176,7 +152,7 @@ int pm_suspend_disk(void) if (in_suspend) { device_resume(); pr_debug("PM: writing image.\n"); - error = swsusp_write(); + error = swsusp_write(pagedir_nosave, nr_copy_pages); if (!error) power_down(pm_disk_mode); else { @@ -247,7 +223,7 @@ static int software_resume(void) pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) { + if ((error = swsusp_read(&pagedir_nosave))) { swsusp_free(); goto Thaw; } @@ -363,37 +339,55 @@ static ssize_t resume_show(struct subsystem * subsys, char *buf) MINOR(swsusp_resume_device)); } -static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t n) +static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) { - int len; - char *p; unsigned int maj, min; - int error = -EINVAL; dev_t res; + int ret = -EINVAL; - p = memchr(buf, '\n', n); - len = p ? p - buf : n; + if (sscanf(buf, "%u:%u", &maj, &min) != 2) + goto out; - if (sscanf(buf, "%u:%u", &maj, &min) == 2) { - res = MKDEV(maj,min); - if (maj == MAJOR(res) && min == MINOR(res)) { - down(&pm_sem); - swsusp_resume_device = res; - up(&pm_sem); - printk("Attempting manual resume\n"); - noresume = 0; - software_resume(); - } - } + res = MKDEV(maj,min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto out; - return error >= 0 ? n : error; + down(&pm_sem); + swsusp_resume_device = res; + up(&pm_sem); + printk("Attempting manual resume\n"); + noresume = 0; + software_resume(); + ret = n; +out: + return ret; } power_attr(resume); +static ssize_t image_size_show(struct subsystem * subsys, char *buf) +{ + return sprintf(buf, "%u\n", image_size); +} + +static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n) +{ + unsigned int size; + + if (sscanf(buf, "%u", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + static struct attribute * g[] = { &disk_attr.attr, &resume_attr.attr, + &image_size_attr.attr, NULL, }; diff --git a/kernel/power/main.c b/kernel/power/main.c index 6ee2cad530e..d253f3ae2fa 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -24,7 +24,7 @@ DECLARE_MUTEX(pm_sem); -struct pm_ops * pm_ops = NULL; +struct pm_ops *pm_ops; suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; /** @@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = { #endif }; +static inline int valid_state(suspend_state_t state) +{ + /* Suspend-to-disk does not really need low-level support. + * It can work with reboot if needed. */ + if (state == PM_SUSPEND_DISK) + return 1; + + if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) + return 0; + return 1; +} + /** * enter_state - Do common work of entering low-power state. @@ -167,7 +179,7 @@ static int enter_state(suspend_state_t state) { int error; - if (pm_ops && pm_ops->valid && !pm_ops->valid(state)) + if (!valid_state(state)) return -ENODEV; if (down_trylock(&pm_sem)) return -EBUSY; @@ -238,9 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) char * s = buf; for (i = 0; i < PM_SUSPEND_MAX; i++) { - if (pm_states[i] && pm_ops && (!pm_ops->valid - ||(pm_ops->valid && pm_ops->valid(i)))) - s += sprintf(s,"%s ",pm_states[i]); + if (pm_states[i] && valid_state(i)) + s += sprintf(s,"%s ", pm_states[i]); } s += sprintf(s,"\n"); return (s - buf); diff --git a/kernel/power/power.h b/kernel/power/power.h index 6c042b5ee14..7e8492fd142 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -9,19 +9,13 @@ #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) #endif -#define MAX_PBES ((PAGE_SIZE - sizeof(struct new_utsname) \ - - 4 - 3*sizeof(unsigned long) - sizeof(int) \ - - sizeof(void *)) / sizeof(swp_entry_t)) - struct swsusp_info { struct new_utsname uts; u32 version_code; unsigned long num_physpages; int cpus; unsigned long image_pages; - unsigned long pagedir_pages; - suspend_pagedir_t * suspend_pagedir; - swp_entry_t pagedir[MAX_PBES]; + unsigned long pages; } __attribute__((aligned(PAGE_SIZE))); @@ -48,25 +42,27 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -extern int freeze_processes(void); -extern void thaw_processes(void); - extern int pm_prepare_console(void); extern void pm_restore_console(void); - /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; extern unsigned int nr_copy_pages; -extern suspend_pagedir_t *pagedir_nosave; -extern suspend_pagedir_t *pagedir_save; +extern struct pbe *pagedir_nosave; + +/* Preferred image size in MB (default 500) */ +extern unsigned int image_size; extern asmlinkage int swsusp_arch_suspend(void); extern asmlinkage int swsusp_arch_resume(void); +extern unsigned int count_data_pages(void); extern void free_pagedir(struct pbe *pblist); +extern void release_eaten_pages(void); extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed); -extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages); extern void swsusp_free(void); extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed); +extern unsigned int snapshot_nr_pages(void); +extern struct pbe *snapshot_pblist(void); +extern void snapshot_pblist_set(struct pbe *pblist); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 4a6dbcefd37..41f66365f0d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -33,7 +33,35 @@ #include "power.h" +struct pbe *pagedir_nosave; +unsigned int nr_copy_pages; + #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned long zone_pfn; + unsigned int n = 0; + + for_each_zone (zone) + if (is_highmem(zone)) { + mark_free_pages(zone); + for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { + struct page *page; + unsigned long pfn = zone_pfn + zone->zone_start_pfn; + if (!pfn_valid(pfn)) + continue; + page = pfn_to_page(pfn); + if (PageReserved(page)) + continue; + if (PageNosaveFree(page)) + continue; + n++; + } + } + return n; +} + struct highmem_page { char *data; struct page *page; @@ -149,17 +177,15 @@ static int saveable(struct zone *zone, unsigned long *zone_pfn) BUG_ON(PageReserved(page) && PageNosave(page)); if (PageNosave(page)) return 0; - if (PageReserved(page) && pfn_is_nosave(pfn)) { - pr_debug("[nosave pfn 0x%lx]", pfn); + if (PageReserved(page) && pfn_is_nosave(pfn)) return 0; - } if (PageNosaveFree(page)) return 0; return 1; } -static unsigned count_data_pages(void) +unsigned int count_data_pages(void) { struct zone *zone; unsigned long zone_pfn; @@ -244,7 +270,7 @@ static inline void fill_pb_page(struct pbe *pbpage) * of memory pages allocated with alloc_pagedir() */ -void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) +static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) { struct pbe *pbpage, *p; unsigned int num = PBES_PER_PAGE; @@ -261,7 +287,35 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) p->next = p + 1; p->next = NULL; } - pr_debug("create_pbe_list(): initialized %d PBEs\n", num); +} + +/** + * On resume it is necessary to trace and eventually free the unsafe + * pages that have been allocated, because they are needed for I/O + * (on x86-64 we likely will "eat" these pages once again while + * creating the temporary page translation tables) + */ + +struct eaten_page { + struct eaten_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct eaten_page *eaten_pages = NULL; + +void release_eaten_pages(void) +{ + struct eaten_page *p, *q; + + p = eaten_pages; + while (p) { + q = p->next; + /* We don't want swsusp_free() to free this page again */ + ClearPageNosave(virt_to_page(p)); + free_page((unsigned long)p); + p = q; + } + eaten_pages = NULL; } /** @@ -282,9 +336,12 @@ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) if (safe_needed) do { res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) + if (res && PageNosaveFree(virt_to_page(res))) { /* This is for swsusp_free() */ SetPageNosave(virt_to_page(res)); + ((struct eaten_page *)res)->next = eaten_pages; + eaten_pages = res; + } } while (res && PageNosaveFree(virt_to_page(res))); else res = (void *)get_zeroed_page(gfp_mask); @@ -332,7 +389,8 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed if (!pbe) { /* get_zeroed_page() failed */ free_pagedir(pblist); pblist = NULL; - } + } else + create_pbe_list(pblist, nr_pages); return pblist; } @@ -370,8 +428,14 @@ void swsusp_free(void) static int enough_free_mem(unsigned int nr_pages) { - pr_debug("swsusp: available memory: %u pages\n", nr_free_pages()); - return nr_free_pages() > (nr_pages + PAGES_FOR_IO + + struct zone *zone; + unsigned int n = 0; + + for_each_zone (zone) + if (!is_highmem(zone)) + n += zone->free_pages; + pr_debug("swsusp: available memory: %u pages\n", n); + return n > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } @@ -395,7 +459,6 @@ static struct pbe *swsusp_alloc(unsigned int nr_pages) printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); return NULL; } - create_pbe_list(pblist, nr_pages); if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { printk(KERN_ERR "suspend: Allocating image pages failed.\n"); @@ -421,10 +484,6 @@ asmlinkage int swsusp_save(void) (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE, PAGES_FOR_IO, nr_free_pages()); - /* This is needed because of the fixed size of swsusp_info */ - if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE) - return -ENOSPC; - if (!enough_free_mem(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free memory\n"); return -ENOMEM; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c05f46e7348..55a18d26abe 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -30,8 +30,8 @@ * Alex Badea <vampire@go.ro>: * Fixed runaway init * - * Andreas Steinmetz <ast@domdv.de>: - * Added encrypted suspend option + * Rafael J. Wysocki <rjw@sisk.pl> + * Added the swap map data structure and reworked the handling of swap * * More state savers are welcome. Especially for the scsi layer... * @@ -67,44 +67,33 @@ #include <asm/tlbflush.h> #include <asm/io.h> -#include <linux/random.h> -#include <linux/crypto.h> -#include <asm/scatterlist.h> - #include "power.h" +/* + * Preferred image size in MB (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N MB, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned int image_size = 500; + #ifdef CONFIG_HIGHMEM +unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else static int save_highmem(void) { return 0; } static int restore_highmem(void) { return 0; } +static unsigned int count_highmem_pages(void) { return 0; } #endif -#define CIPHER "aes" -#define MAXKEY 32 -#define MAXIV 32 - extern char resume_file[]; -/* Local variables that should not be affected by save */ -unsigned int nr_copy_pages __nosavedata = 0; - -/* Suspend pagedir is allocated before final copy, therefore it - must be freed after resume - - Warning: this is even more evil than it seems. Pagedirs this file - talks about are completely different from page directories used by - MMU hardware. - */ -suspend_pagedir_t *pagedir_nosave __nosavedata = NULL; - #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; - u8 key_iv[MAXKEY+MAXIV]; - swp_entry_t swsusp_info; + char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + swp_entry_t image; char orig_sig[10]; char sig[10]; } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; @@ -115,140 +104,9 @@ static struct swsusp_info swsusp_info; * Saving part... */ -/* We memorize in swapfile_used what swap devices are used for suspension */ -#define SWAPFILE_UNUSED 0 -#define SWAPFILE_SUSPEND 1 /* This is the suspending device */ -#define SWAPFILE_IGNORED 2 /* Those are other swap devices ignored for suspension */ - -static unsigned short swapfile_used[MAX_SWAPFILES]; -static unsigned short root_swap; - -static int write_page(unsigned long addr, swp_entry_t *loc); -static int bio_read_page(pgoff_t page_off, void *page); - -static u8 key_iv[MAXKEY+MAXIV]; - -#ifdef CONFIG_SWSUSP_ENCRYPT - -static int crypto_init(int mode, void **mem) -{ - int error = 0; - int len; - char *modemsg; - struct crypto_tfm *tfm; - - modemsg = mode ? "suspend not possible" : "resume not possible"; - - tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); - if(!tfm) { - printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); - error = -EINVAL; - goto out; - } - - if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { - printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); - error = -ENOKEY; - goto fail; - } - - if (mode) - get_random_bytes(key_iv, MAXKEY+MAXIV); - - len = crypto_tfm_alg_max_keysize(tfm); - if (len > MAXKEY) - len = MAXKEY; - - if (crypto_cipher_setkey(tfm, key_iv, len)) { - printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); - error = -EKEYREJECTED; - goto fail; - } - - len = crypto_tfm_alg_ivsize(tfm); - - if (MAXIV < len) { - printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); - error = -EOVERFLOW; - goto fail; - } - - crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); - - *mem=(void *)tfm; - - goto out; - -fail: crypto_free_tfm(tfm); -out: return error; -} - -static __inline__ void crypto_exit(void *mem) -{ - crypto_free_tfm((struct crypto_tfm *)mem); -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - src.page = virt_to_page(p->address); - src.offset = 0; - src.length = PAGE_SIZE; - dst.page = virt_to_page((void *)&swsusp_header); - dst.offset = 0; - dst.length = PAGE_SIZE; - - error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, - PAGE_SIZE); - - if (!error) - error = write_page((unsigned long)&swsusp_header, - &(p->swap_address)); - return error; -} - -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - int error = 0; - struct scatterlist src, dst; - - error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); - if (!error) { - src.offset = 0; - src.length = PAGE_SIZE; - dst.offset = 0; - dst.length = PAGE_SIZE; - src.page = dst.page = virt_to_page((void *)p->address); - - error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, - &src, PAGE_SIZE); - } - return error; -} -#else -static __inline__ int crypto_init(int mode, void *mem) -{ - return 0; -} - -static __inline__ void crypto_exit(void *mem) -{ -} - -static __inline__ int crypto_write(struct pbe *p, void *mem) -{ - return write_page(p->address, &(p->swap_address)); -} +static unsigned short root_swap = 0xffff; -static __inline__ int crypto_read(struct pbe *p, void *mem) -{ - return bio_read_page(swp_offset(p->swap_address), (void *)p->address); -} -#endif - -static int mark_swapfiles(swp_entry_t prev) +static int mark_swapfiles(swp_entry_t start) { int error; @@ -259,8 +117,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); - memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); - swsusp_header.swsusp_info = prev; + swsusp_header.image = start; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), virt_to_page((unsigned long) @@ -283,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev) * devfs, since the resume code can only recognize the form /dev/hda4, * but the suspend code would see the long name.) */ -static int is_resume_device(const struct swap_info_struct *swap_info) +static inline int is_resume_device(const struct swap_info_struct *swap_info) { struct file *file = swap_info->swap_file; struct inode *inode = file->f_dentry->d_inode; @@ -294,54 +151,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info) static int swsusp_swap_check(void) /* This is called before saving image */ { - int i, len; - - len=strlen(resume_file); - root_swap = 0xFFFF; - - spin_lock(&swap_lock); - for (i=0; i<MAX_SWAPFILES; i++) { - if (!(swap_info[i].flags & SWP_WRITEOK)) { - swapfile_used[i]=SWAPFILE_UNUSED; - } else { - if (!len) { - printk(KERN_WARNING "resume= option should be used to set suspend device" ); - if (root_swap == 0xFFFF) { - swapfile_used[i] = SWAPFILE_SUSPEND; - root_swap = i; - } else - swapfile_used[i] = SWAPFILE_IGNORED; - } else { - /* we ignore all swap devices that are not the resume_file */ - if (is_resume_device(&swap_info[i])) { - swapfile_used[i] = SWAPFILE_SUSPEND; - root_swap = i; - } else { - swapfile_used[i] = SWAPFILE_IGNORED; - } - } - } - } - spin_unlock(&swap_lock); - return (root_swap != 0xffff) ? 0 : -ENODEV; -} - -/** - * This is called after saving image so modification - * will be lost after resume... and that's what we want. - * we make the device unusable. A new call to - * lock_swapdevices can unlock the devices. - */ -static void lock_swapdevices(void) -{ int i; + if (!swsusp_resume_device) + return -ENODEV; spin_lock(&swap_lock); - for (i = 0; i< MAX_SWAPFILES; i++) - if (swapfile_used[i] == SWAPFILE_IGNORED) { - swap_info[i].flags ^= SWP_WRITEOK; + for (i = 0; i < MAX_SWAPFILES; i++) { + if (!(swap_info[i].flags & SWP_WRITEOK)) + continue; + if (is_resume_device(swap_info + i)) { + spin_unlock(&swap_lock); + root_swap = i; + return 0; } + } spin_unlock(&swap_lock); + return -ENODEV; } /** @@ -359,72 +184,217 @@ static void lock_swapdevices(void) static int write_page(unsigned long addr, swp_entry_t *loc) { swp_entry_t entry; - int error = 0; + int error = -ENOSPC; - entry = get_swap_page(); - if (swp_offset(entry) && - swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) { - error = rw_swap_page_sync(WRITE, entry, - virt_to_page(addr)); - if (error == -EIO) - error = 0; - if (!error) + entry = get_swap_page_of_type(root_swap); + if (swp_offset(entry)) { + error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr)); + if (!error || error == -EIO) *loc = entry; - } else - error = -ENOSPC; + } return error; } /** - * data_free - Free the swap entries used by the saved image. + * Swap map-handling functions + * + * The swap map is a data structure used for keeping track of each page + * written to the swap. It consists of many swap_map_page structures + * that contain each an array of MAP_PAGE_SIZE swap entries. + * These structures are linked together with the help of either the + * .next (in memory) or the .next_swap (in swap) member. * - * Walk the list of used swap entries and free each one. - * This is only used for cleanup when suspend fails. + * The swap map is created during suspend. At that time we need to keep + * it in memory, because we have to free all of the allocated swap + * entries if an error occurs. The memory needed is preallocated + * so that we know in advance if there's enough of it. + * + * The first swap_map_page structure is filled with the swap entries that + * correspond to the first MAP_PAGE_SIZE data pages written to swap and + * so on. After the all of the data pages have been written, the order + * of the swap_map_page structures in the map is reversed so that they + * can be read from swap in the original order. This causes the data + * pages to be loaded in exactly the same order in which they have been + * saved. + * + * During resume we only need to use one swap_map_page structure + * at a time, which means that we only need to use two memory pages for + * reading the image - one for reading the swap_map_page structures + * and the second for reading the data pages from swap. */ -static void data_free(void) + +#define MAP_PAGE_SIZE ((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \ + / sizeof(swp_entry_t)) + +struct swap_map_page { + swp_entry_t entries[MAP_PAGE_SIZE]; + swp_entry_t next_swap; + struct swap_map_page *next; +}; + +static inline void free_swap_map(struct swap_map_page *swap_map) { - swp_entry_t entry; - struct pbe *p; + struct swap_map_page *swp; - for_each_pbe (p, pagedir_nosave) { - entry = p->swap_address; - if (entry.val) - swap_free(entry); - else - break; + while (swap_map) { + swp = swap_map->next; + free_page((unsigned long)swap_map); + swap_map = swp; } } +static struct swap_map_page *alloc_swap_map(unsigned int nr_pages) +{ + struct swap_map_page *swap_map, *swp; + unsigned n = 0; + + if (!nr_pages) + return NULL; + + pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages); + swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swap_map; + for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) { + swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + swp = swp->next; + if (!swp) { + free_swap_map(swap_map); + return NULL; + } + } + return swap_map; +} + /** - * data_write - Write saved image to swap. - * - * Walk the list of pages in the image and sync each one to swap. + * reverse_swap_map - reverse the order of pages in the swap map + * @swap_map */ -static int data_write(void) + +static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map) { - int error = 0, i = 0; - unsigned int mod = nr_copy_pages / 100; - struct pbe *p; - void *tfm; + struct swap_map_page *prev, *next; + + prev = NULL; + while (swap_map) { + next = swap_map->next; + swap_map->next = prev; + prev = swap_map; + swap_map = next; + } + return prev; +} - if ((error = crypto_init(1, &tfm))) - return error; +/** + * free_swap_map_entries - free the swap entries allocated to store + * the swap map @swap_map (this is only called in case of an error) + */ +static inline void free_swap_map_entries(struct swap_map_page *swap_map) +{ + while (swap_map) { + if (swap_map->next_swap.val) + swap_free(swap_map->next_swap); + swap_map = swap_map->next; + } +} - if (!mod) - mod = 1; +/** + * save_swap_map - save the swap map used for tracing the data pages + * stored in the swap + */ - printk( "Writing data to swap (%d pages)... ", nr_copy_pages ); - for_each_pbe (p, pagedir_nosave) { - if (!(i%mod)) - printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = crypto_write(p, tfm))) { - crypto_exit(tfm); +static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start) +{ + swp_entry_t entry = (swp_entry_t){0}; + int error; + + while (swap_map) { + swap_map->next_swap = entry; + if ((error = write_page((unsigned long)swap_map, &entry))) return error; - } - i++; + swap_map = swap_map->next; } - printk("\b\b\b\bdone\n"); - crypto_exit(tfm); + *start = entry; + return 0; +} + +/** + * free_image_entries - free the swap entries allocated to store + * the image data pages (this is only called in case of an error) + */ + +static inline void free_image_entries(struct swap_map_page *swp) +{ + unsigned k; + + while (swp) { + for (k = 0; k < MAP_PAGE_SIZE; k++) + if (swp->entries[k].val) + swap_free(swp->entries[k]); + swp = swp->next; + } +} + +/** + * The swap_map_handle structure is used for handling the swap map in + * a file-alike way + */ + +struct swap_map_handle { + struct swap_map_page *cur; + unsigned int k; +}; + +static inline void init_swap_map_handle(struct swap_map_handle *handle, + struct swap_map_page *map) +{ + handle->cur = map; + handle->k = 0; +} + +static inline int swap_map_write_page(struct swap_map_handle *handle, + unsigned long addr) +{ + int error; + + error = write_page(addr, handle->cur->entries + handle->k); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->cur = handle->cur->next; + handle->k = 0; + } + return 0; +} + +/** + * save_image_data - save the data pages pointed to by the PBEs + * from the list @pblist using the swap map handle @handle + * (assume there are @nr_pages data pages to save) + */ + +static int save_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) +{ + unsigned int m; + struct pbe *p; + int error = 0; + + printk("Saving image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + for_each_pbe (p, pblist) { + error = swap_map_write_page(handle, p->address); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + if (!error) + printk("\b\b\b\bdone\n"); return error; } @@ -440,70 +410,70 @@ static void dump_info(void) pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname); pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus); pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages); - pr_debug(" swsusp: Pagedir: %ld Pages\n",swsusp_info.pagedir_pages); + pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages); } -static void init_header(void) +static void init_header(unsigned int nr_pages) { memset(&swsusp_info, 0, sizeof(swsusp_info)); swsusp_info.version_code = LINUX_VERSION_CODE; swsusp_info.num_physpages = num_physpages; memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname)); - swsusp_info.suspend_pagedir = pagedir_nosave; swsusp_info.cpus = num_online_cpus(); - swsusp_info.image_pages = nr_copy_pages; -} - -static int close_swap(void) -{ - swp_entry_t entry; - int error; - - dump_info(); - error = write_page((unsigned long)&swsusp_info, &entry); - if (!error) { - printk( "S" ); - error = mark_swapfiles(entry); - printk( "|\n" ); - } - return error; + swsusp_info.image_pages = nr_pages; + swsusp_info.pages = nr_pages + + ((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; } /** - * free_pagedir_entries - Free pages used by the page directory. - * - * This is used during suspend for error recovery. + * pack_orig_addresses - the .orig_address fields of the PBEs from the + * list starting at @pbe are stored in the array @buf[] (1 page) */ -static void free_pagedir_entries(void) +static inline struct pbe *pack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - int i; + int j; - for (i = 0; i < swsusp_info.pagedir_pages; i++) - swap_free(swsusp_info.pagedir[i]); + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + buf[j] = pbe->orig_address; + pbe = pbe->next; + } + if (!pbe) + for (; j < PAGE_SIZE / sizeof(long); j++) + buf[j] = 0; + return pbe; } - /** - * write_pagedir - Write the array of pages holding the page directory. - * @last: Last swap entry we write (needed for header). + * save_image_metadata - save the .orig_address fields of the PBEs + * from the list @pblist using the swap map handle @handle */ -static int write_pagedir(void) +static int save_image_metadata(struct pbe *pblist, + struct swap_map_handle *handle) { - int error = 0; + unsigned long *buf; unsigned int n = 0; - struct pbe *pbe; + struct pbe *p; + int error = 0; - printk( "Writing pagedir..."); - for_each_pb_page (pbe, pagedir_nosave) { - if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++]))) - return error; + printk("Saving image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) + return -ENOMEM; + p = pblist; + while (p) { + p = pack_orig_addresses(buf, p); + error = swap_map_write_page(handle, (unsigned long)buf); + if (error) + break; + n++; } - - swsusp_info.pagedir_pages = n; - printk("done (%u pages)\n", n); + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages saved)\n", n); return error; } @@ -511,75 +481,125 @@ static int write_pagedir(void) * enough_swap - Make sure we have enough swap to save the image. * * Returns TRUE or FALSE after checking the total amount of swap - * space avaiable. - * - * FIXME: si_swapinfo(&i) returns all swap devices information. - * We should only consider resume_device. + * space avaiable from the resume partition. */ static int enough_swap(unsigned int nr_pages) { - struct sysinfo i; + unsigned int free_swap = swap_info[root_swap].pages - + swap_info[root_swap].inuse_pages; - si_swapinfo(&i); - pr_debug("swsusp: available swap: %lu pages\n", i.freeswap); - return i.freeswap > (nr_pages + PAGES_FOR_IO + + pr_debug("swsusp: free swap pages: %u\n", free_swap); + return free_swap > (nr_pages + PAGES_FOR_IO + (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE); } /** - * write_suspend_image - Write entire image and metadata. + * swsusp_write - Write entire image and metadata. * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) */ -static int write_suspend_image(void) + +int swsusp_write(struct pbe *pblist, unsigned int nr_pages) { + struct swap_map_page *swap_map; + struct swap_map_handle handle; + swp_entry_t start; int error; - if (!enough_swap(nr_copy_pages)) { + if ((error = swsusp_swap_check())) { + printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); + return error; + } + if (!enough_swap(nr_pages)) { printk(KERN_ERR "swsusp: Not enough free swap\n"); return -ENOSPC; } - init_header(); - if ((error = data_write())) - goto FreeData; + init_header(nr_pages); + swap_map = alloc_swap_map(swsusp_info.pages); + if (!swap_map) + return -ENOMEM; + init_swap_map_handle(&handle, swap_map); + + error = swap_map_write_page(&handle, (unsigned long)&swsusp_info); + if (!error) + error = save_image_metadata(pblist, &handle); + if (!error) + error = save_image_data(pblist, &handle, nr_pages); + if (error) + goto Free_image_entries; - if ((error = write_pagedir())) - goto FreePagedir; + swap_map = reverse_swap_map(swap_map); + error = save_swap_map(swap_map, &start); + if (error) + goto Free_map_entries; - if ((error = close_swap())) - goto FreePagedir; - Done: - memset(key_iv, 0, MAXKEY+MAXIV); + dump_info(); + printk( "S" ); + error = mark_swapfiles(start); + printk( "|\n" ); + if (error) + goto Free_map_entries; + +Free_swap_map: + free_swap_map(swap_map); return error; - FreePagedir: - free_pagedir_entries(); - FreeData: - data_free(); - goto Done; + +Free_map_entries: + free_swap_map_entries(swap_map); +Free_image_entries: + free_image_entries(swap_map); + goto Free_swap_map; } -/* It is important _NOT_ to umount filesystems at this point. We want - * them synced (in case something goes wrong) but we DO not want to mark - * filesystem clean: it is not. (And it does not matter, if we resume - * correctly, we'll mark system clean, anyway.) +/** + * swsusp_shrink_memory - Try to free as much memory as needed + * + * ... but do not OOM-kill anyone + * + * Notice: all userland should be stopped before it is called, or + * livelock is possible. */ -int swsusp_write(void) -{ - int error; - if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); - return error; - } - lock_swapdevices(); - error = write_suspend_image(); - /* This will unlock ignored swap devices since writing is finished */ - lock_swapdevices(); - return error; -} +#define SHRINK_BITE 10000 +int swsusp_shrink_memory(void) +{ + long size, tmp; + struct zone *zone; + unsigned long pages = 0; + unsigned int i = 0; + char *p = "-\\|/"; + + printk("Shrinking memory... "); + do { + size = 2 * count_highmem_pages(); + size += size / 50 + count_data_pages(); + size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE + + PAGES_FOR_IO; + tmp = size; + for_each_zone (zone) + if (!is_highmem(zone)) + tmp -= zone->free_pages; + if (tmp > 0) { + tmp = shrink_all_memory(SHRINK_BITE); + if (!tmp) + return -ENOMEM; + pages += tmp; + } else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) { + tmp = shrink_all_memory(SHRINK_BITE); + pages += tmp; + } + printk("\b%c", p[i++%4]); + } while (tmp > 0); + printk("\bdone (%lu pages freed)\n", pages); + return 0; +} int swsusp_suspend(void) { @@ -677,7 +697,6 @@ static void copy_page_backup_list(struct pbe *dst, struct pbe *src) /* We assume both lists contain the same number of elements */ while (src) { dst->orig_address = src->orig_address; - dst->swap_address = src->swap_address; dst = dst->next; src = src->next; } @@ -757,198 +776,224 @@ static int bio_write_page(pgoff_t page_off, void *page) return submit(WRITE, page_off, page); } -/* - * Sanity check if this image makes sense with this kernel/swap context - * I really don't think that it's foolproof but more than nothing.. +/** + * The following functions allow us to read data using a swap map + * in a file-alike way */ -static const char *sanity_check(void) +static inline void release_swap_map_reader(struct swap_map_handle *handle) { - dump_info(); - if (swsusp_info.version_code != LINUX_VERSION_CODE) - return "kernel version"; - if (swsusp_info.num_physpages != num_physpages) - return "memory size"; - if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) - return "system type"; - if (strcmp(swsusp_info.uts.release,system_utsname.release)) - return "kernel release"; - if (strcmp(swsusp_info.uts.version,system_utsname.version)) - return "version"; - if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) - return "machine"; -#if 0 - /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ - if (swsusp_info.cpus != num_possible_cpus()) - return "number of cpus"; -#endif - return NULL; + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; } - -static int check_header(void) +static inline int get_swap_map_reader(struct swap_map_handle *handle, + swp_entry_t start) { - const char *reason = NULL; int error; - if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info))) + if (!swp_offset(start)) + return -EINVAL; + handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); + if (!handle->cur) + return -ENOMEM; + error = bio_read_page(swp_offset(start), handle->cur); + if (error) { + release_swap_map_reader(handle); return error; - - /* Is this same machine? */ - if ((reason = sanity_check())) { - printk(KERN_ERR "swsusp: Resume mismatch: %s\n",reason); - return -EPERM; } - nr_copy_pages = swsusp_info.image_pages; - return error; + handle->k = 0; + return 0; } -static int check_sig(void) +static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf) { + unsigned long offset; int error; - memset(&swsusp_header, 0, sizeof(swsusp_header)); - if ((error = bio_read_page(0, &swsusp_header))) - return error; - if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { - memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); - memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); - memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); - - /* - * Reset swap signature now. - */ - error = bio_write_page(0, &swsusp_header); - } else { + if (!handle->cur) + return -EINVAL; + offset = swp_offset(handle->cur->entries[handle->k]); + if (!offset) return -EINVAL; + error = bio_read_page(offset, buf); + if (error) + return error; + if (++handle->k >= MAP_PAGE_SIZE) { + handle->k = 0; + offset = swp_offset(handle->cur->next_swap); + if (!offset) + release_swap_map_reader(handle); + else + error = bio_read_page(offset, handle->cur); } - if (!error) - pr_debug("swsusp: Signature found, resuming\n"); return error; } -/** - * data_read - Read image pages from swap. - * - * You do not need to check for overlaps, check_pagedir() - * already did that. - */ - -static int data_read(struct pbe *pblist) +static int check_header(void) { - struct pbe *p; - int error = 0; - int i = 0; - int mod = swsusp_info.image_pages / 100; - void *tfm; - - if ((error = crypto_init(0, &tfm))) - return error; - - if (!mod) - mod = 1; - - printk("swsusp: Reading image data (%lu pages): ", - swsusp_info.image_pages); - - for_each_pbe (p, pblist) { - if (!(i % mod)) - printk("\b\b\b\b%3d%%", i / mod); + char *reason = NULL; - if ((error = crypto_read(p, tfm))) { - crypto_exit(tfm); - return error; - } - - i++; + dump_info(); + if (swsusp_info.version_code != LINUX_VERSION_CODE) + reason = "kernel version"; + if (swsusp_info.num_physpages != num_physpages) + reason = "memory size"; + if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname)) + reason = "system type"; + if (strcmp(swsusp_info.uts.release,system_utsname.release)) + reason = "kernel release"; + if (strcmp(swsusp_info.uts.version,system_utsname.version)) + reason = "version"; + if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) + reason = "machine"; + if (reason) { + printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); + return -EPERM; } - printk("\b\b\b\bdone\n"); - crypto_exit(tfm); - return error; + return 0; } /** - * read_pagedir - Read page backup list pages from swap + * load_image_data - load the image data using the swap map handle + * @handle and store them using the page backup list @pblist + * (assume there are @nr_pages pages to load) */ -static int read_pagedir(struct pbe *pblist) +static int load_image_data(struct pbe *pblist, + struct swap_map_handle *handle, + unsigned int nr_pages) { - struct pbe *pbpage, *p; - unsigned int i = 0; int error; + unsigned int m; + struct pbe *p; if (!pblist) - return -EFAULT; - - printk("swsusp: Reading pagedir (%lu pages)\n", - swsusp_info.pagedir_pages); - - for_each_pb_page (pbpage, pblist) { - unsigned long offset = swp_offset(swsusp_info.pagedir[i++]); - - error = -EFAULT; - if (offset) { - p = (pbpage + PB_PAGE_SKIP)->next; - error = bio_read_page(offset, (void *)pbpage); - (pbpage + PB_PAGE_SKIP)->next = p; - } + return -EINVAL; + printk("Loading image data pages (%u pages) ... ", nr_pages); + m = nr_pages / 100; + if (!m) + m = 1; + nr_pages = 0; + p = pblist; + while (p) { + error = swap_map_read_page(handle, (void *)p->address); if (error) break; + p = p->next; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; } - if (!error) - BUG_ON(i != swsusp_info.pagedir_pages); - + printk("\b\b\b\bdone\n"); return error; } +/** + * unpack_orig_addresses - copy the elements of @buf[] (1 page) to + * the PBEs in the list starting at @pbe + */ -static int check_suspend_image(void) +static inline struct pbe *unpack_orig_addresses(unsigned long *buf, + struct pbe *pbe) { - int error = 0; + int j; - if ((error = check_sig())) - return error; - - if ((error = check_header())) - return error; - - return 0; + for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { + pbe->orig_address = buf[j]; + pbe = pbe->next; + } + return pbe; } -static int read_suspend_image(void) +/** + * load_image_metadata - load the image metadata using the swap map + * handle @handle and put them into the PBEs in the list @pblist + */ + +static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle) { - int error = 0; struct pbe *p; + unsigned long *buf; + unsigned int n = 0; + int error = 0; - if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0))) + printk("Loading image metadata ... "); + buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC); + if (!buf) return -ENOMEM; - - if ((error = read_pagedir(p))) - return error; - create_pbe_list(p, nr_copy_pages); - mark_unsafe_pages(p); - pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); - if (pagedir_nosave) { - create_pbe_list(pagedir_nosave, nr_copy_pages); - copy_page_backup_list(pagedir_nosave, p); + p = pblist; + while (p) { + error = swap_map_read_page(handle, buf); + if (error) + break; + p = unpack_orig_addresses(buf, p); + n++; } - free_pagedir(p); - if (!pagedir_nosave) - return -ENOMEM; + free_page((unsigned long)buf); + if (!error) + printk("done (%u pages loaded)\n", n); + return error; +} - /* Allocate memory for the image and read the data from swap */ +int swsusp_read(struct pbe **pblist_ptr) +{ + int error; + struct pbe *p, *pblist; + struct swap_map_handle handle; + unsigned int nr_pages; - error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1); + if (IS_ERR(resume_bdev)) { + pr_debug("swsusp: block device not initialised\n"); + return PTR_ERR(resume_bdev); + } + error = get_swap_map_reader(&handle, swsusp_header.image); if (!error) - error = data_read(pagedir_nosave); + error = swap_map_read_page(&handle, &swsusp_info); + if (!error) + error = check_header(); + if (error) + return error; + nr_pages = swsusp_info.image_pages; + p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0); + if (!p) + return -ENOMEM; + error = load_image_metadata(p, &handle); + if (!error) { + mark_unsafe_pages(p); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); + if (pblist) + copy_page_backup_list(pblist, p); + free_pagedir(p); + if (!pblist) + error = -ENOMEM; + + /* Allocate memory for the image and read the data from swap */ + if (!error) + error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + if (!error) { + release_eaten_pages(); + error = load_image_data(pblist, &handle, nr_pages); + } + if (!error) + *pblist_ptr = pblist; + } + release_swap_map_reader(&handle); + blkdev_put(resume_bdev); + + if (!error) + pr_debug("swsusp: Reading resume file was successful\n"); + else + pr_debug("swsusp: Error %d resuming\n", error); return error; } /** - * swsusp_check - Check for saved image in swap + * swsusp_check - Check for swsusp signature in the resume device */ int swsusp_check(void) @@ -958,40 +1003,27 @@ int swsusp_check(void) resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); if (!IS_ERR(resume_bdev)) { set_blocksize(resume_bdev, PAGE_SIZE); - error = check_suspend_image(); + memset(&swsusp_header, 0, sizeof(swsusp_header)); + if ((error = bio_read_page(0, &swsusp_header))) + return error; + if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { + memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + /* Reset swap signature now */ + error = bio_write_page(0, &swsusp_header); + } else { + return -EINVAL; + } if (error) - blkdev_put(resume_bdev); - } else + blkdev_put(resume_bdev); + else + pr_debug("swsusp: Signature found, resuming\n"); + } else { error = PTR_ERR(resume_bdev); - - if (!error) - pr_debug("swsusp: resume file found\n"); - else - pr_debug("swsusp: Error %d check for resume file\n", error); - return error; -} - -/** - * swsusp_read - Read saved image from swap. - */ - -int swsusp_read(void) -{ - int error; - - if (IS_ERR(resume_bdev)) { - pr_debug("swsusp: block device not initialised\n"); - return PTR_ERR(resume_bdev); } - error = read_suspend_image(); - blkdev_put(resume_bdev); - memset(key_iv, 0, MAXKEY+MAXIV); + if (error) + pr_debug("swsusp: Error %d check for resume file\n", error); - if (!error) - pr_debug("swsusp: Reading resume file was successful\n"); - else - pr_debug("swsusp: Error %d resuming\n", error); return error; } diff --git a/kernel/printk.c b/kernel/printk.c index 5287be83e3e..2251be80cd2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) p[1] <= '7' && p[2] == '>') { loglev_char = p[1]; p += 3; - printed_len += 3; + printed_len -= 3; } else { loglev_char = default_message_loglevel + '0'; @@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) for (tp = tbuf; tp < tbuf + tlen; tp++) emit_log_char(*tp); - printed_len += tlen - 3; + printed_len += tlen; } else { if (p[0] != '<' || p[1] < '0' || p[1] > '7' || p[2] != '>') { @@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) emit_log_char(default_message_loglevel + '0'); emit_log_char('>'); + printed_len += 3; } - printed_len += 3; } log_level_unknown = 0; if (!*p) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 17ee7e5a345..cceaf09ac41 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -241,7 +241,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (write) { copy_to_user_page(vma, page, addr, maddr + offset, buf, bytes); - set_page_dirty_lock(page); + if (!PageCompound(page)) + set_page_dirty_lock(page); } else { copy_from_user_page(vma, page, addr, buf, maddr + offset, bytes); @@ -407,54 +408,62 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -#ifndef __ARCH_SYS_PTRACE -static int ptrace_get_task_struct(long request, long pid, - struct task_struct **childp) +/** + * ptrace_traceme -- helper for PTRACE_TRACEME + * + * Performs checks and sets PT_PTRACED. + * Should be used by all ptrace implementations for PTRACE_TRACEME. + */ +int ptrace_traceme(void) { - struct task_struct *child; int ret; /* - * Callers use child == NULL as an indication to exit early even - * when the return value is 0, so make sure it is non-NULL here. + * Are we already being traced? + */ + if (current->ptrace & PT_PTRACED) + return -EPERM; + ret = security_ptrace(current->parent, current); + if (ret) + return -EPERM; + /* + * Set the ptrace bit in the process ptrace flags. */ - *childp = NULL; + current->ptrace |= PT_PTRACED; + return 0; +} - if (request == PTRACE_TRACEME) { - /* - * Are we already being traced? - */ - if (current->ptrace & PT_PTRACED) - return -EPERM; - ret = security_ptrace(current->parent, current); - if (ret) - return -EPERM; - /* - * Set the ptrace bit in the process ptrace flags. - */ - current->ptrace |= PT_PTRACED; - return 0; - } +/** + * ptrace_get_task_struct -- grab a task struct reference for ptrace + * @pid: process id to grab a task_struct reference of + * + * This function is a helper for ptrace implementations. It checks + * permissions and then grabs a task struct for use of the actual + * ptrace implementation. + * + * Returns the task_struct for @pid or an ERR_PTR() on failure. + */ +struct task_struct *ptrace_get_task_struct(pid_t pid) +{ + struct task_struct *child; /* - * You may not mess with init + * Tracing init is not allowed. */ if (pid == 1) - return -EPERM; + return ERR_PTR(-EPERM); - ret = -ESRCH; read_lock(&tasklist_lock); child = find_task_by_pid(pid); if (child) get_task_struct(child); read_unlock(&tasklist_lock); if (!child) - return -ESRCH; - - *childp = child; - return 0; + return ERR_PTR(-ESRCH); + return child; } +#ifndef __ARCH_SYS_PTRACE asmlinkage long sys_ptrace(long request, long pid, long addr, long data) { struct task_struct *child; @@ -464,9 +473,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) * This lock_kernel fixes a subtle race with suid exec */ lock_kernel(); - ret = ptrace_get_task_struct(request, pid, &child); - if (!child) + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); goto out; + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } if (request == PTRACE_ATTACH) { ret = ptrace_attach(child); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c4d159a21e0..30b0bba0385 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -35,6 +35,7 @@ #include <linux/init.h> #include <linux/spinlock.h> #include <linux/smp.h> +#include <linux/rcupdate.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <asm/atomic.h> @@ -45,7 +46,6 @@ #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/rcupdate.h> -#include <linux/rcuref.h> #include <linux/cpu.h> /* Definition for rcupdate control block. */ @@ -61,9 +61,9 @@ struct rcu_state { /* for current batch to proceed. */ }; -static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; -static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = +static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp = {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; @@ -73,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10000; -#ifndef __HAVE_ARCH_CMPXCHG -/* - * We use an array of spinlocks for the rcurefs -- similar to ones in sparc - * 32 bit atomic_t implementations, and a hash function similar to that - * for our refcounting needs. - * Can't help multiprocessors which donot have cmpxchg :( - */ - -spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { - [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED -}; -#endif - /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -116,6 +103,10 @@ void fastcall call_rcu(struct rcu_head *head, local_irq_restore(flags); } +static atomic_t rcu_barrier_cpu_count; +static struct semaphore rcu_barrier_sema; +static struct completion rcu_barrier_completion; + /** * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. * @head: structure to be used for queueing the RCU updates. @@ -162,6 +153,42 @@ long rcu_batches_completed(void) return rcu_ctrlblk.completed; } +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *notused) +{ + int cpu = smp_processor_id(); + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + struct rcu_head *head; + + head = &rdp->barrier; + atomic_inc(&rcu_barrier_cpu_count); + call_rcu(head, rcu_barrier_callback); +} + +/** + * rcu_barrier - Wait until all the in-flight RCUs are complete. + */ +void rcu_barrier(void) +{ + BUG_ON(in_interrupt()); + /* Take cpucontrol semaphore to protect against CPU hotplug */ + down(&rcu_barrier_sema); + init_completion(&rcu_barrier_completion); + atomic_set(&rcu_barrier_cpu_count, 0); + on_each_cpu(rcu_barrier_func, NULL, 0, 1); + wait_for_completion(&rcu_barrier_completion); + up(&rcu_barrier_sema); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + /* * Invoke the completed RCU callbacks. They are expected to be in * a per-cpu list. @@ -217,15 +244,23 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp, struct rcu_state *rsp, if (rcp->next_pending && rcp->completed == rcp->cur) { - /* Can't change, since spin lock held. */ - cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); - rcp->next_pending = 0; - /* next_pending == 0 must be visible in __rcu_process_callbacks() - * before it can see new value of cur. + /* + * next_pending == 0 must be visible in + * __rcu_process_callbacks() before it can see new value of cur. */ smp_wmb(); rcp->cur++; + + /* + * Accessing nohz_cpu_mask before incrementing rcp->cur needs a + * Barrier Otherwise it can cause tickless idle CPUs to be + * included in rsp->cpumask, which will extend graceperiods + * unnecessarily. + */ + smp_mb(); + cpus_andnot(rsp->cpumask, cpu_online_map, nohz_cpu_mask); + } } @@ -457,6 +492,7 @@ static struct notifier_block __devinitdata rcu_nb = { */ void __init rcu_init(void) { + sema_init(&rcu_barrier_sema, 1); rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 88c28d47655..773219907dd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -39,7 +39,6 @@ #include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> -#include <linux/rcuref.h> #include <linux/cpu.h> #include <linux/random.h> #include <linux/delay.h> @@ -49,9 +48,11 @@ MODULE_LICENSE("GPL"); static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ -static int stat_interval = 0; /* Interval between stats, in seconds. */ +static int stat_interval; /* Interval between stats, in seconds. */ /* Defaults to "only at end of test". */ -static int verbose = 0; /* Print more debug info. */ +static int verbose; /* Print more debug info. */ +static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ MODULE_PARM(nreaders, "i"); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); @@ -59,6 +60,10 @@ MODULE_PARM(stat_interval, "i"); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); MODULE_PARM(verbose, "i"); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +MODULE_PARM(test_no_idle_hz, "i"); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +MODULE_PARM(shuffle_interval, "i"); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); #define TORTURE_FLAG "rcutorture: " #define PRINTK_STRING(s) \ do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) @@ -73,6 +78,7 @@ static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **reader_tasks; static struct task_struct *stats_task; +static struct task_struct *shuffler_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -103,7 +109,7 @@ atomic_t n_rcu_torture_error; /* * Allocate an element from the rcu_tortures pool. */ -struct rcu_torture * +static struct rcu_torture * rcu_torture_alloc(void) { struct list_head *p; @@ -376,12 +382,77 @@ rcu_torture_stats(void *arg) return 0; } +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + */ +void rcu_torture_shuffle_tasks(void) +{ + cpumask_t tmp_mask = CPU_MASK_ALL; + int i; + + lock_cpu_hotplug(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + unlock_cpu_hotplug(); + return; + } + + if (rcu_idle_cpu != -1) + cpu_clear(rcu_idle_cpu, tmp_mask); + + set_cpus_allowed(current, tmp_mask); + + if (reader_tasks != NULL) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed(reader_tasks[i], tmp_mask); + } + + if (writer_task) + set_cpus_allowed(writer_task, tmp_mask); + + if (stats_task) + set_cpus_allowed(stats_task, tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + unlock_cpu_hotplug(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU. + */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + static void rcu_torture_cleanup(void) { int i; fullstop = 1; + if (shuffler_task != NULL) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + } + shuffler_task = NULL; + if (writer_task != NULL) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); kthread_stop(writer_task); @@ -409,9 +480,8 @@ rcu_torture_cleanup(void) stats_task = NULL; /* Wait for all RCU callbacks to fire. */ + rcu_barrier(); - for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) - synchronize_rcu(); rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ printk(KERN_ALERT TORTURE_FLAG "--- End of test: %s\n", @@ -431,9 +501,11 @@ rcu_torture_init(void) nrealreaders = nreaders; else nrealreaders = 2 * num_online_cpus(); - printk(KERN_ALERT TORTURE_FLAG - "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", - nrealreaders, stat_interval, verbose); + printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval = %d\n", + nrealreaders, stat_interval, verbose, test_no_idle_hz, + shuffle_interval); fullstop = 0; /* Set up the freelist. */ @@ -503,6 +575,18 @@ rcu_torture_init(void) goto unwind; } } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } return 0; unwind: diff --git a/kernel/sched.c b/kernel/sched.c index 6f46c94cc29..92733091154 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p) #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ < (long long) (sd)->cache_hot_time) +void __put_task_struct_cb(struct rcu_head *rhp) +{ + __put_task_struct(container_of(rhp, struct task_struct, rcu)); +} + +EXPORT_SYMBOL_GPL(__put_task_struct_cb); + /* * These are the runqueue data structures: */ diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189ef..08aa5b263f3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk) /* Ok, we're done with the signal handlers */ tsk->sighand = NULL; if (atomic_dec_and_test(&sighand->count)) - kmem_cache_free(sighand_cachep, sighand); + sighand_free(sighand); } void exit_sighand(struct task_struct *tsk) { write_lock_irq(&tasklist_lock); - __exit_sighand(tsk); + rcu_read_lock(); + if (tsk->sighand != NULL) { + struct sighand_struct *sighand = rcu_dereference(tsk->sighand); + spin_lock(&sighand->siglock); + __exit_sighand(tsk); + spin_unlock(&sighand->siglock); + } + rcu_read_unlock(); write_unlock_irq(&tasklist_lock); } @@ -345,19 +352,20 @@ void exit_sighand(struct task_struct *tsk) void __exit_signal(struct task_struct *tsk) { struct signal_struct * sig = tsk->signal; - struct sighand_struct * sighand = tsk->sighand; + struct sighand_struct * sighand; if (!sig) BUG(); if (!atomic_read(&sig->count)) BUG(); + rcu_read_lock(); + sighand = rcu_dereference(tsk->sighand); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); if (atomic_dec_and_test(&sig->count)) { posix_cpu_timers_exit_group(tsk); - if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); tsk->signal = NULL; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); flush_sigqueue(&sig->shared_pending); } else { @@ -389,9 +397,11 @@ void __exit_signal(struct task_struct *tsk) sig->nvcsw += tsk->nvcsw; sig->nivcsw += tsk->nivcsw; sig->sched_time += tsk->sched_time; + __exit_sighand(tsk); spin_unlock(&sighand->siglock); sig = NULL; /* Marker for below. */ } + rcu_read_unlock(); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); flush_sigqueue(&tsk->pending); if (sig) { @@ -613,6 +623,33 @@ void signal_wake_up(struct task_struct *t, int resume) * Returns 1 if any signals were found. * * All callers must be holding the siglock. + * + * This version takes a sigset mask and looks at all signals, + * not just those in the first mask word. + */ +static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +{ + struct sigqueue *q, *n; + sigset_t m; + + sigandsets(&m, mask, &s->signal); + if (sigisemptyset(&m)) + return 0; + + signandsets(&s->signal, &s->signal, mask); + list_for_each_entry_safe(q, n, &s->list, list) { + if (sigismember(mask, q->info.si_signo)) { + list_del_init(&q->list); + __sigqueue_free(q); + } + } + return 1; +} +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. */ static int rm_from_queue(unsigned long mask, struct sigpending *s) { @@ -1080,18 +1117,29 @@ void zap_other_threads(struct task_struct *p) } /* - * Must be called with the tasklist_lock held for reading! + * Must be called under rcu_read_lock() or with tasklist_lock read-held. */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { unsigned long flags; + struct sighand_struct *sp; int ret; +retry: ret = check_kill_permission(sig, info, p); - if (!ret && sig && p->sighand) { - spin_lock_irqsave(&p->sighand->siglock, flags); + if (!ret && sig && (sp = rcu_dereference(p->sighand))) { + spin_lock_irqsave(&sp->siglock, flags); + if (p->sighand != sp) { + spin_unlock_irqrestore(&sp->siglock, flags); + goto retry; + } + if ((atomic_read(&sp->count) == 0) || + (atomic_read(&p->usage) == 0)) { + spin_unlock_irqrestore(&sp->siglock, flags); + return -ESRCH; + } ret = __group_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sp->siglock, flags); } return ret; @@ -1136,14 +1184,21 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) { int error; + int acquired_tasklist_lock = 0; struct task_struct *p; - read_lock(&tasklist_lock); + rcu_read_lock(); + if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { + read_lock(&tasklist_lock); + acquired_tasklist_lock = 1; + } p = find_task_by_pid(pid); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); - read_unlock(&tasklist_lock); + if (unlikely(acquired_tasklist_lock)) + read_unlock(&tasklist_lock); + rcu_read_unlock(); return error; } @@ -1163,8 +1218,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, ret = -ESRCH; goto out_unlock; } - if ((!info || ((unsigned long)info != 1 && - (unsigned long)info != 2 && SI_FROMUSER(info))) + if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) && (euid != p->suid) && (euid != p->uid) && (uid != p->suid) && (uid != p->uid)) { ret = -EPERM; @@ -1355,16 +1409,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) { unsigned long flags; int ret = 0; + struct sighand_struct *sh; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + + /* + * The rcu based delayed sighand destroy makes it possible to + * run this without tasklist lock held. The task struct itself + * cannot go away as create_timer did get_task_struct(). + * + * We return -1, when the task is marked exiting, so + * posix_timer_event can redirect it to the group leader + */ + rcu_read_lock(); if (unlikely(p->flags & PF_EXITING)) { ret = -1; goto out_err; } - spin_lock_irqsave(&p->sighand->siglock, flags); +retry: + sh = rcu_dereference(p->sighand); + + spin_lock_irqsave(&sh->siglock, flags); + if (p->sighand != sh) { + /* We raced with exec() in a multithreaded process... */ + spin_unlock_irqrestore(&sh->siglock, flags); + goto retry; + } + + /* + * We do the check here again to handle the following scenario: + * + * CPU 0 CPU 1 + * send_sigqueue + * check PF_EXITING + * interrupt exit code running + * __exit_signal + * lock sighand->siglock + * unlock sighand->siglock + * lock sh->siglock + * add(tsk->pending) flush_sigqueue(tsk->pending) + * + */ + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out; + } if (unlikely(!list_empty(&q->list))) { /* @@ -1388,9 +1480,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) signal_wake_up(p, sig == SIGKILL); out: - spin_unlock_irqrestore(&p->sighand->siglock, flags); + spin_unlock_irqrestore(&sh->siglock, flags); out_err: - read_unlock(&tasklist_lock); + rcu_read_unlock(); return ret; } @@ -1402,7 +1494,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) int ret = 0; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); + read_lock(&tasklist_lock); + /* Since it_lock is held, p->sighand cannot be NULL. */ spin_lock_irqsave(&p->sighand->siglock, flags); handle_stop_signal(sig, p); @@ -1436,7 +1530,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); read_unlock(&tasklist_lock); - return(ret); + return ret; } /* @@ -2338,6 +2432,7 @@ int do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) { struct k_sigaction *k; + sigset_t mask; if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) return -EINVAL; @@ -2385,9 +2480,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) *k = *act; sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - rm_from_queue(sigmask(sig), &t->signal->shared_pending); + sigemptyset(&mask); + sigaddset(&mask, sig); + rm_from_queue_full(&mask, &t->signal->shared_pending); do { - rm_from_queue(sigmask(sig), &t->pending); + rm_from_queue_full(&mask, &t->pending); recalc_sigpending_tsk(t); t = next_thread(t); } while (t != current); diff --git a/kernel/sys.c b/kernel/sys.c index bce933ebb29..b6941e06d5d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -32,6 +32,7 @@ #include <linux/compat.h> #include <linux/syscalls.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -168,7 +169,7 @@ EXPORT_SYMBOL(notifier_chain_unregister); * of the last notifier function called. */ -int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) +int __kprobes notifier_call_chain(struct notifier_block **n, unsigned long val, void *v) { int ret=NOTIFY_DONE; struct notifier_block *nb = *n; @@ -488,6 +489,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. + */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: @@ -1083,10 +1090,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) { struct task_struct *p; + struct task_struct *group_leader = current->group_leader; int err = -EINVAL; if (!pid) - pid = current->pid; + pid = group_leader->pid; if (!pgid) pgid = pid; if (pgid < 0) @@ -1106,16 +1114,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (!thread_group_leader(p)) goto out; - if (p->parent == current || p->real_parent == current) { + if (p->real_parent == group_leader) { err = -EPERM; - if (p->signal->session != current->signal->session) + if (p->signal->session != group_leader->signal->session) goto out; err = -EACCES; if (p->did_exec) goto out; } else { err = -ESRCH; - if (p != current) + if (p != group_leader) goto out; } @@ -1127,7 +1135,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *p; do_each_task_pid(pgid, PIDTYPE_PGID, p) { - if (p->signal->session == current->signal->session) + if (p->signal->session == group_leader->signal->session) goto ok_pgid; } while_each_task_pid(pgid, PIDTYPE_PGID, p); goto out; @@ -1207,24 +1215,22 @@ asmlinkage long sys_getsid(pid_t pid) asmlinkage long sys_setsid(void) { + struct task_struct *group_leader = current->group_leader; struct pid *pid; int err = -EPERM; - if (!thread_group_leader(current)) - return -EINVAL; - down(&tty_sem); write_lock_irq(&tasklist_lock); - pid = find_pid(PIDTYPE_PGID, current->pid); + pid = find_pid(PIDTYPE_PGID, group_leader->pid); if (pid) goto out; - current->signal->leader = 1; - __set_special_pids(current->pid, current->pid); - current->signal->tty = NULL; - current->signal->tty_old_pgrp = 0; - err = process_group(current); + group_leader->signal->leader = 1; + __set_special_pids(group_leader->pid, group_leader->pid); + group_leader->signal->tty = NULL; + group_leader->signal->tty_old_pgrp = 0; + err = process_group(group_leader); out: write_unlock_irq(&tasklist_lock); up(&tty_sem); @@ -1686,7 +1692,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) if (unlikely(!p->signal)) return; + utime = stime = cputime_zero; + switch (who) { + case RUSAGE_BOTH: case RUSAGE_CHILDREN: spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; @@ -1696,22 +1705,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); - break; + + if (who == RUSAGE_CHILDREN) + break; + case RUSAGE_SELF: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = stime = cputime_zero; - goto sum_group; - case RUSAGE_BOTH: - spin_lock_irqsave(&p->sighand->siglock, flags); - utime = p->signal->cutime; - stime = p->signal->cstime; - r->ru_nvcsw = p->signal->cnvcsw; - r->ru_nivcsw = p->signal->cnivcsw; - r->ru_minflt = p->signal->cmin_flt; - r->ru_majflt = p->signal->cmaj_flt; - sum_group: utime = cputime_add(utime, p->signal->utime); stime = cputime_add(stime, p->signal->stime); r->ru_nvcsw += p->signal->nvcsw; @@ -1728,13 +1726,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += t->maj_flt; t = next_thread(t); } while (t != p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - cputime_to_timeval(utime, &r->ru_utime); - cputime_to_timeval(stime, &r->ru_stime); break; + default: BUG(); } + + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2ef..bd3b9bfcfce 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -82,6 +82,28 @@ cond_syscall(compat_sys_socketcall); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); +cond_syscall(sys_migrate_pages); +cond_syscall(sys_chown16); +cond_syscall(sys_fchown16); +cond_syscall(sys_getegid16); +cond_syscall(sys_geteuid16); +cond_syscall(sys_getgid16); +cond_syscall(sys_getgroups16); +cond_syscall(sys_getresgid16); +cond_syscall(sys_getresuid16); +cond_syscall(sys_getuid16); +cond_syscall(sys_lchown16); +cond_syscall(sys_setfsgid16); +cond_syscall(sys_setfsuid16); +cond_syscall(sys_setgid16); +cond_syscall(sys_setgroups16); +cond_syscall(sys_setregid16); +cond_syscall(sys_setresgid16); +cond_syscall(sys_setresuid16); +cond_syscall(sys_setreuid16); +cond_syscall(sys_setuid16); +cond_syscall(sys_vm86old); +cond_syscall(sys_vm86); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9990e10192e..03b0598f236 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> +#include <linux/kobject.h> #include <linux/net.h> #include <linux/sysrq.h> #include <linux/highuid.h> @@ -67,6 +68,8 @@ extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; +extern int sysctl_drop_caches; +extern int percpu_pagelist_fraction; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -77,15 +80,13 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; static int minolduid; +static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif -#ifdef CONFIG_HOTPLUG -extern char hotplug_path[]; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif @@ -110,7 +111,7 @@ extern int pwrsw_enabled; extern int unaligned_enabled; #endif -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU extern int sysctl_ieee_emulation_warnings; #endif @@ -397,8 +398,8 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", - .data = &hotplug_path, - .maxlen = HOTPLUG_PATH_LEN, + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, .mode = 0644, .proc_handler = &proc_dostring, .strategy = &sysctl_string, @@ -544,7 +545,7 @@ static ctl_table kern_table[] = { .extra1 = &minolduid, .extra2 = &maxolduid, }, -#ifdef CONFIG_ARCH_S390 +#ifdef CONFIG_S390 #ifdef CONFIG_MATHEMU { .ctl_name = KERN_IEEE_EMULATION_WARNINGS, @@ -646,7 +647,7 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_ARCH_S390) +#if defined(CONFIG_S390) { .ctl_name = KERN_SPIN_RETRY, .procname = "spin_retry", @@ -777,6 +778,15 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, }, { + .ctl_name = VM_DROP_PAGECACHE, + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .strategy = &sysctl_intvec, + }, + { .ctl_name = VM_MIN_FREE_KBYTES, .procname = "min_free_kbytes", .data = &min_free_kbytes, @@ -786,6 +796,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_PERCPU_PAGELIST_FRACTION, + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = &percpu_pagelist_fraction_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &min_percpu_pagelist_fract, + }, #ifdef CONFIG_MMU { .ctl_name = VM_MAX_MAP_COUNT, @@ -2192,29 +2212,32 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen, void **context) { - size_t l, len; - if (!table->data || !table->maxlen) return -ENOTDIR; if (oldval && oldlenp) { - if (get_user(len, oldlenp)) + size_t bufsize; + if (get_user(bufsize, oldlenp)) return -EFAULT; - if (len) { - l = strlen(table->data); - if (len > l) len = l; - if (len >= table->maxlen) + if (bufsize) { + size_t len = strlen(table->data), copied; + + /* This shouldn't trigger for a well-formed sysctl */ + if (len > table->maxlen) len = table->maxlen; - if(copy_to_user(oldval, table->data, len)) - return -EFAULT; - if(put_user(0, ((char __user *) oldval) + len)) + + /* Copy up to a max of bufsize-1 bytes of the string */ + copied = (len >= bufsize) ? bufsize - 1 : len; + + if (copy_to_user(oldval, table->data, copied) || + put_user(0, (char __user *)(oldval + copied))) return -EFAULT; - if(put_user(len, oldlenp)) + if (put_user(len, oldlenp)) return -EFAULT; } } if (newval && newlen) { - len = newlen; + size_t len = newlen; if (len > table->maxlen) len = table->maxlen; if(copy_from_user(table->data, newval, len)) @@ -2223,7 +2246,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, len--; ((char *) table->data)[len] = 0; } - return 0; + return 1; } /* diff --git a/kernel/time.c b/kernel/time.c index 245d595a13c..b94bfa8c03e 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -561,6 +561,28 @@ void getnstimeofday(struct timespec *tv) EXPORT_SYMBOL_GPL(getnstimeofday); #endif +void getnstimestamp(struct timespec *ts) +{ + unsigned int seq; + struct timespec wall2mono; + + /* synchronize with settimeofday() changes */ + do { + seq = read_seqbegin(&xtime_lock); + getnstimeofday(ts); + wall2mono = wall_to_monotonic; + } while(unlikely(read_seqretry(&xtime_lock, seq))); + + /* adjust to monotonicaly-increasing values */ + ts->tv_sec += wall2mono.tv_sec; + ts->tv_nsec += wall2mono.tv_nsec; + while (unlikely(ts->tv_nsec >= NSEC_PER_SEC)) { + ts->tv_nsec -= NSEC_PER_SEC; + ts->tv_sec++; + } +} +EXPORT_SYMBOL_GPL(getnstimestamp); + #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) { diff --git a/kernel/timer.c b/kernel/timer.c index fd74268d866..074b4bd5cfd 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -33,6 +33,7 @@ #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> +#include <linux/delay.h> #include <asm/uaccess.h> #include <asm/unistd.h> diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42df83d7fad..82c4fa70595 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,7 +29,8 @@ #include <linux/kthread.h> /* - * The per-CPU workqueue (if single thread, we always use cpu 0's). + * The per-CPU workqueue (if single thread, we always use the first + * possible cpu). * * The sequence counters are for flush_scheduled_work(). It wants to wait * until until all currently-scheduled works are completed, but it doesn't @@ -69,6 +70,8 @@ struct workqueue_struct { static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); +static int singlethread_cpu; + /* If it's single threaded, it isn't in the list of workqueues. */ static inline int is_single_threaded(struct workqueue_struct *wq) { @@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) if (!test_and_set_bit(0, &work->pending)) { if (unlikely(is_single_threaded(wq))) - cpu = 0; + cpu = singlethread_cpu; BUG_ON(!list_empty(&work->entry)); __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); ret = 1; @@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data) int cpu = smp_processor_id(); if (unlikely(is_single_threaded(wq))) - cpu = 0; + cpu = singlethread_cpu; __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); } @@ -266,8 +269,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) might_sleep(); if (is_single_threaded(wq)) { - /* Always use cpu 0's area. */ - flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0)); + /* Always use first cpu's area. */ + flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); } else { int cpu; @@ -315,12 +318,17 @@ struct workqueue_struct *__create_workqueue(const char *name, return NULL; wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); + if (!wq->cpu_wq) { + kfree(wq); + return NULL; + } + wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); if (singlethread) { INIT_LIST_HEAD(&wq->list); - p = create_workqueue_thread(wq, 0); + p = create_workqueue_thread(wq, singlethread_cpu); if (!p) destroy = 1; else @@ -374,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq) /* We don't need the distraction of CPUs appearing and vanishing. */ lock_cpu_hotplug(); if (is_single_threaded(wq)) - cleanup_workqueue_thread(wq, 0); + cleanup_workqueue_thread(wq, singlethread_cpu); else { for_each_online_cpu(cpu) cleanup_workqueue_thread(wq, cpu); @@ -419,6 +427,25 @@ int schedule_delayed_work_on(int cpu, return ret; } +int schedule_on_each_cpu(void (*func) (void *info), void *info) +{ + int cpu; + struct work_struct *work; + + work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); + + if (!work) + return -ENOMEM; + for_each_online_cpu(cpu) { + INIT_WORK(work + cpu, func, info); + __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), + work + cpu); + } + flush_workqueue(keventd_wq); + kfree(work); + return 0; +} + void flush_scheduled_work(void) { flush_workqueue(keventd_wq); @@ -543,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, void init_workqueues(void) { + singlethread_cpu = first_cpu(cpu_possible_map); hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); |