summaryrefslogtreecommitdiffstats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--kernel/cpuset.c179
1 files changed, 160 insertions, 19 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38..8c3c400cce9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
* distribution for more details.
*/
-#include <linux/config.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
@@ -241,7 +240,7 @@ static struct super_block *cpuset_sb;
* A cpuset can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cpusets is empty. Since all
* tasks in the system use _some_ cpuset, and since there is always at
- * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * least one task in the system (init), therefore, top_cpuset
* always has either children cpusets and/or using tasks. So we don't
* need a special hack to ensure that top_cpuset cannot be deleted.
*
@@ -290,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
inode->i_mode = mode;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
- inode->i_blksize = PAGE_CACHE_SIZE;
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -763,6 +761,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
*
* Call with manage_mutex held. May nest a call to the
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
*/
static void update_cpu_domains(struct cpuset *cur)
@@ -782,7 +782,7 @@ static void update_cpu_domains(struct cpuset *cur)
if (is_cpu_exclusive(c))
cpus_andnot(pspan, pspan, c->cpus_allowed);
}
- if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+ if (!is_cpu_exclusive(cur)) {
cpus_or(pspan, pspan, cur->cpus_allowed);
if (cpus_equal(pspan, cur->cpus_allowed))
return;
@@ -815,6 +815,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
struct cpuset trialcs;
int retval, cpus_unchanged;
+ /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = cpulist_parse(buf, trialcs.cpus_allowed);
if (retval < 0)
@@ -908,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
int fudge;
int retval;
+ /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
+ if (cs == &top_cpuset)
+ return -EACCES;
+
trialcs = *cs;
retval = nodelist_parse(buf, trialcs.mems_allowed);
if (retval < 0)
@@ -1064,7 +1072,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
}
/*
- * Frequency meter - How fast is some event occuring?
+ * Frequency meter - How fast is some event occurring?
*
* These routines manage a digitally filtered, constant time based,
* event frequency meter. There are four routines:
@@ -1217,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
task_lock(tsk);
oldcs = tsk->cpuset;
- if (!oldcs) {
+ /*
+ * After getting 'oldcs' cpuset ptr, be sure still not exiting.
+ * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
+ * then fail this attach_task(), to avoid breaking top_cpuset.count.
+ */
+ if (tsk->flags & PF_EXITING) {
task_unlock(tsk);
mutex_unlock(&callback_mutex);
put_task_struct(tsk);
@@ -1918,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex. Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cpuset *cs = dentry->d_fsdata;
@@ -1937,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
mutex_unlock(&manage_mutex);
return -EBUSY;
}
+ if (is_cpu_exclusive(cs)) {
+ int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+ if (retval < 0) {
+ mutex_unlock(&manage_mutex);
+ return retval;
+ }
+ }
parent = cs->parent;
mutex_lock(&callback_mutex);
set_bit(CS_REMOVED, &cs->flags);
- if (is_cpu_exclusive(cs))
- update_cpu_domains(cs);
list_del(&cs->sibling); /* delete my sibling from parent->children */
spin_lock(&cs->dentry->d_lock);
d = dget(cs->dentry);
@@ -2016,6 +2045,104 @@ out:
return err;
}
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
+/*
+ * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * or memory nodes, we need to walk over the cpuset hierarchy,
+ * removing that CPU or node from all cpusets. If this removes the
+ * last CPU or node from a cpuset, then the guarantee_online_cpus()
+ * or guarantee_online_mems() code will use that emptied cpusets
+ * parent online CPUs or nodes. Cpusets that were already empty of
+ * CPUs or nodes are left empty.
+ *
+ * This routine is intentionally inefficient in a couple of regards.
+ * It will check all cpusets in a subtree even if the top cpuset of
+ * the subtree has no offline CPUs or nodes. It checks both CPUs and
+ * nodes, even though the caller could have been coded to know that
+ * only one of CPUs or nodes needed to be checked on a given call.
+ * This was done to minimize text size rather than cpu cycles.
+ *
+ * Call with both manage_mutex and callback_mutex held.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+{
+ struct cpuset *c;
+
+ /* Each of our child cpusets mems must be online */
+ list_for_each_entry(c, &cur->children, sibling) {
+ guarantee_online_cpus_mems_in_subtree(c);
+ if (!cpus_empty(c->cpus_allowed))
+ guarantee_online_cpus(c, &c->cpus_allowed);
+ if (!nodes_empty(c->mems_allowed))
+ guarantee_online_mems(c, &c->mems_allowed);
+ }
+}
+
+/*
+ * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
+ * cpu_online_map and node_online_map. Force the top cpuset to track
+ * whats online after any CPU or memory node hotplug or unplug event.
+ *
+ * To ensure that we don't remove a CPU or node from the top cpuset
+ * that is currently in use by a child cpuset (which would violate
+ * the rule that cpusets must be subsets of their parent), we first
+ * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ *
+ * Since there are two callers of this routine, one for CPU hotplug
+ * events and one for memory node hotplug events, we could have coded
+ * two separate routines here. We code it as a single common routine
+ * in order to minimize text size.
+ */
+
+static void common_cpu_mem_hotplug_unplug(void)
+{
+ mutex_lock(&manage_mutex);
+ mutex_lock(&callback_mutex);
+
+ guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+ top_cpuset.cpus_allowed = cpu_online_map;
+ top_cpuset.mems_allowed = node_online_map;
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&manage_mutex);
+}
+#endif
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * The top_cpuset tracks what CPUs and Memory Nodes are online,
+ * period. This is necessary in order to make cpusets transparent
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_online_map on each CPU hotplug (cpuhp) event.
+ */
+
+static int cpuset_handle_cpuhp(struct notifier_block *nb,
+ unsigned long phase, void *cpu)
+{
+ common_cpu_mem_hotplug_unplug();
+ return 0;
+}
+#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+/*
+ * Keep top_cpuset.mems_allowed tracking node_online_map.
+ * Call this routine anytime after you change node_online_map.
+ * See also the previous routine cpuset_handle_cpuhp().
+ */
+
+void cpuset_track_online_nodes()
+{
+ common_cpu_mem_hotplug_unplug();
+}
+#endif
+
/**
* cpuset_init_smp - initialize cpus_allowed
*
@@ -2026,6 +2153,8 @@ void __init cpuset_init_smp(void)
{
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;
+
+ hotcpu_notifier(cpuset_handle_cpuhp, 0);
}
/**
@@ -2195,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
int i;
for (i = 0; zl->zones[i]; i++) {
- int nid = zl->zones[i]->zone_pgdat->node_id;
+ int nid = zone_to_nid(zl->zones[i]);
if (node_isset(nid, current->mems_allowed))
return 1;
@@ -2266,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
- if (in_interrupt())
+ if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
- node = z->zone_pgdat->node_id;
+ node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2370,7 +2499,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
int cpuset_excl_nodes_overlap(const struct task_struct *p)
{
const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
- int overlap = 0; /* do cpusets overlap? */
+ int overlap = 1; /* do cpusets overlap? */
task_lock(current);
if (current->flags & PF_EXITING) {
@@ -2442,31 +2571,43 @@ void __cpuset_memory_pressure_bump(void)
*/
static int proc_cpuset_show(struct seq_file *m, void *v)
{
+ struct pid *pid;
struct task_struct *tsk;
char *buf;
- int retval = 0;
+ int retval;
+ retval = -ENOMEM;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
- return -ENOMEM;
+ goto out;
+
+ retval = -ESRCH;
+ pid = m->private;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ if (!tsk)
+ goto out_free;
- tsk = m->private;
+ retval = -EINVAL;
mutex_lock(&manage_mutex);
+
retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
if (retval < 0)
- goto out;
+ goto out_unlock;
seq_puts(m, buf);
seq_putc(m, '\n');
-out:
+out_unlock:
mutex_unlock(&manage_mutex);
+ put_task_struct(tsk);
+out_free:
kfree(buf);
+out:
return retval;
}
static int cpuset_open(struct inode *inode, struct file *file)
{
- struct task_struct *tsk = PROC_I(inode)->task;
- return single_open(file, proc_cpuset_show, tsk);
+ struct pid *pid = PROC_I(inode)->pid;
+ return single_open(file, proc_cpuset_show, pid);
}
struct file_operations proc_cpuset_operations = {