diff options
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r-- | kernel/cgroup.c | 587 |
1 files changed, 422 insertions, 165 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a35510af..2731d115d72 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> +#include <linux/flex_array.h> /* used in cgroup_attach_proc */ #include <asm/atomic.h> @@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. */ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. + */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval = 0; + int retval; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; - struct css_set *cg; - struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; /* Nothing to do if the task is already in that cgroup */ @@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } + if (ss->can_attach_task) { + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + goto out; + } + } } - task_lock(tsk); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - put_css_set(cg); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - retval = -ESRCH; + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); + if (retval) goto out; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); - write_unlock(&css_set_lock); for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + if (ss->attach_task) + ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); - put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1829,7 +1881,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk, false); + ss->cancel_attach(ss, cgrp, tsk); } } return retval; @@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. + * TODO: possible optimization: use css_set->rcu_head for chaining instead + */ +struct cg_list_entry { + struct css_set *cg; + struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, + struct task_struct *tsk, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + + read_lock(&css_set_lock); + newcg = find_existing_css_set(cg, cgrp, template); + if (newcg) + get_css_set(newcg); + read_unlock(&css_set_lock); + + /* doesn't exist at all? */ + if (!newcg) + return false; + /* see if it's already in the list */ + list_for_each_entry(cg_entry, newcg_list, links) { + if (cg_entry->cg == newcg) { + put_css_set(newcg); + return true; + } + } + + /* not found */ + put_css_set(newcg); + return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. + */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct flex_array *group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. + */ + group_size = get_nr_threads(leader); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); + if (!group) + return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. + */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + /* ignore this task if it's going away */ + task_unlock(tsk); + continue; + } + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. */ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. + */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } +out_free_group_list: + flex_array_free(group); + return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { + if (!tsk) { rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. + */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ + rcu_read_unlock(); + cgroup_unlock(); return -ESRCH; } + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); + cgroup_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { - tsk = current; + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; get_task_struct(tsk); } - ret = cgroup_attach_task(cgrp, tsk); + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } put_task_struct(tsk); + cgroup_unlock(); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +{ int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the operation, in which case we need + * to find the task_struct for the new leader and start over. + */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); return ret; } @@ -3259,9 +3632,9 @@ static struct cftype files[] = { { .name = CGROUP_FILE_GENERIC_PREFIX "procs", .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ + .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, - .mode = S_IRUGO, + .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", @@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. - */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Keep the cgroup alive */ - task_lock(tsk); - parent = task_cgroup(tsk, subsys->subsys_id); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - /* The cgroup now exists. Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(root->sb); - return ret; -} - -/** * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question * @task: the task in question |