56 files changed, 2087 insertions, 2840 deletions
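The recurring pattern in this series is synchronous self-removal: sysfs attribute methods that used to bounce through device_schedule_callback() can now call device_remove_file_self(), which is backed by kernfs_remove_self() and breaks the deadlock between an attribute deleting itself and its own active reference. A minimal sketch of a store method using the new helper follows; struct foo_device, to_foo_dev() and foo_teardown() are hypothetical stand-ins for the real drivers (pci-sysfs.c, scsi_sysfs.c, dcssblk.c) converted in the hunks below.

#include <linux/device.h>
#include <linux/kernel.h>

/* Hypothetical device wrapper standing in for struct pci_dev, scsi_device, ... */
struct foo_device {
	struct device dev;
};

#define to_foo_dev(d)	container_of((d), struct foo_device, dev)

/* Hypothetical teardown; the real conversions unregister/remove the device here. */
static void foo_teardown(struct foo_device *fdev)
{
	device_unregister(&fdev->dev);
}

static ssize_t delete_store(struct device *dev, struct device_attribute *attr,
			    const char *buf, size_t count)
{
	unsigned long val;

	if (kstrtoul(buf, 0, &val) < 0)
		return -EINVAL;

	/*
	 * device_remove_file_self() removes this very attribute and returns
	 * true only for the caller that wins the removal race (see
	 * kernfs_remove_self() in the fs/kernfs/dir.c hunk below); losing
	 * callers simply return count without touching the device.
	 */
	if (val && device_remove_file_self(dev, attr))
		foo_teardown(to_foo_dev(dev));

	return count;
}
static DEVICE_ATTR(delete, S_IWUSR, NULL, delete_store);

As in the converted drivers, only the winning writer performs the teardown, and kernfs guarantees that competing writers return only after the winner's whole store operation has drained.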
diff --git a/arch/s390/include/asm/ccwgroup.h b/arch/s390/include/asm/ccwgroup.h index 23723ce5ca7..6e670f88d12 100644 --- a/arch/s390/include/asm/ccwgroup.h +++ b/arch/s390/include/asm/ccwgroup.h @@ -23,6 +23,7 @@ struct ccwgroup_device { unsigned int count; struct device dev; struct ccw_device *cdev[0]; + struct work_struct ungroup_work; }; /** diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index cf8a12ff733..ab4a9139300 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -48,29 +48,27 @@ static ssize_t show_pfgid(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(pfgid, S_IRUGO, show_pfgid, NULL); -static void recover_callback(struct device *dev) +static ssize_t store_recover(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); struct zpci_dev *zdev = get_zdev(pdev); int ret; + if (!device_remove_file_self(dev, attr)) + return count; + pci_stop_and_remove_bus_device(pdev); ret = zpci_disable_device(zdev); if (ret) - return; + return ret; ret = zpci_enable_device(zdev); if (ret) - return; + return ret; pci_rescan_bus(zdev->bus); -} - -static ssize_t store_recover(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - int rc = device_schedule_callback(dev, recover_callback); - return rc ? rc : count; + return count; } static DEVICE_ATTR(recover, S_IWUSR, NULL, store_recover); diff --git a/arch/sparc/kernel/leon_pci_grpci2.c b/arch/sparc/kernel/leon_pci_grpci2.c index 5f0402aab7f..24d6a444634 100644 --- a/arch/sparc/kernel/leon_pci_grpci2.c +++ b/arch/sparc/kernel/leon_pci_grpci2.c @@ -8,6 +8,7 @@ #include <linux/of_device.h> #include <linux/kernel.h> #include <linux/pci.h> +#include <linux/slab.h> #include <linux/delay.h> #include <linux/export.h> #include <asm/io.h> diff --git a/arch/sparc/kernel/sun4m_irq.c b/arch/sparc/kernel/sun4m_irq.c index c5ade9d27a1..8bb3b3fddea 100644 --- a/arch/sparc/kernel/sun4m_irq.c +++ b/arch/sparc/kernel/sun4m_irq.c @@ -9,6 +9,8 @@ * Copyright (C) 1996 Dave Redman (djhr@tadpole.co.uk) */ +#include <linux/slab.h> + #include <asm/timer.h> #include <asm/traps.h> #include <asm/pgalloc.h> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4e491d9b529..4aefd46d7d9 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -894,7 +894,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -906,17 +906,14 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", +struct cgroup_subsys blkio_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .subsys_id = blkio_subsys_id, .base_cftypes = blkcg_files, - .module = THIS_MODULE, }; -EXPORT_SYMBOL_GPL(blkio_subsys); +EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1106,7 +1103,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_cftypes(&blkio_subsys, pol->cftypes)); + WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes)); ret = 0; out_unlock: 
mutex_unlock(&blkcg_pol_mutex); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 86154eab952..15a8d640de5 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -186,7 +186,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_subsys_id)); + return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -241,12 +241,16 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) */ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { - int ret; + char *p; - ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (ret) + p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); + if (!p) { strncpy(buf, "<unavailable>", buflen); - return ret; + return -ENAMETOOLONG; + } + + memmove(buf, p, buf + buflen - p); + return 0; } /** diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 1474c3ab7e7..861c363e412 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1425,28 +1425,24 @@ static struct cftype throtl_files[] = { .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, - .max_write_len = 256, }, { .name = "throttle.write_bps_device", .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write_string = tg_set_conf_u64, - .max_write_len = 256, }, { .name = "throttle.read_iops_device", .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, - .max_write_len = 256, }, { .name = "throttle.write_iops_device", .private = offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write_string = tg_set_conf_uint, - .max_write_len = 256, }, { .name = "throttle.io_service_bytes", diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 744833b630c..46118794339 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1838,7 +1838,6 @@ static struct cftype cfq_blkcg_files[] = { .flags = CFTYPE_ONLY_ON_ROOT, .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, - .max_write_len = 256, }, { .name = "weight", @@ -1853,7 +1852,6 @@ static struct cftype cfq_blkcg_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cfqg_print_weight_device, .write_string = cfqg_set_weight_device, - .max_write_len = 256, }, { .name = "weight", @@ -1866,7 +1864,6 @@ static struct cftype cfq_blkcg_files[] = { .name = "leaf_weight_device", .seq_show = cfqg_print_leaf_weight_device, .write_string = cfqg_set_leaf_weight_device, - .max_write_len = 256, }, { .name = "leaf_weight", diff --git a/drivers/base/core.c b/drivers/base/core.c index 2b567177ef7..4195364f9fd 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -571,6 +571,23 @@ void device_remove_file(struct device *dev, EXPORT_SYMBOL_GPL(device_remove_file); /** + * device_remove_file_self - remove sysfs attribute file from its own method. + * @dev: device. + * @attr: device attribute descriptor. + * + * See kernfs_remove_self() for details. + */ +bool device_remove_file_self(struct device *dev, + const struct device_attribute *attr) +{ + if (dev) + return sysfs_remove_file_self(&dev->kobj, &attr->attr); + else + return false; +} +EXPORT_SYMBOL_GPL(device_remove_file_self); + +/** * device_create_bin_file - create sysfs binary attribute file for device. * @dev: device. 
* @attr: device binary attribute descriptor. @@ -598,39 +615,6 @@ void device_remove_bin_file(struct device *dev, } EXPORT_SYMBOL_GPL(device_remove_bin_file); -/** - * device_schedule_callback_owner - helper to schedule a callback for a device - * @dev: device. - * @func: callback function to invoke later. - * @owner: module owning the callback routine - * - * Attribute methods must not unregister themselves or their parent device - * (which would amount to the same thing). Attempts to do so will deadlock, - * since unregistration is mutually exclusive with driver callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @dev as its - * argument in the workqueue's process context. @dev will be pinned until - * @func returns. - * - * This routine is usually called via the inline device_schedule_callback(), - * which automatically sets @owner to THIS_MODULE. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available. - * - * NOTE: This routine won't work if CONFIG_SYSFS isn't set! It uses an - * underlying sysfs routine (since it is intended for use by attribute - * methods), and if sysfs isn't available you'll get nothing but -ENOSYS. - */ -int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *), struct module *owner) -{ - return sysfs_schedule_callback(&dev->kobj, - (void (*)(void *)) func, dev, owner); -} -EXPORT_SYMBOL_GPL(device_schedule_callback_owner); - static void klist_children_get(struct klist_node *n) { struct device_private *p = to_device_private_parent(n); diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c index 1e16cbd61da..cfe1d8bc7bb 100644 --- a/drivers/base/dma-buf.c +++ b/drivers/base/dma-buf.c @@ -251,9 +251,8 @@ EXPORT_SYMBOL_GPL(dma_buf_put); * @dmabuf: [in] buffer to attach device to. * @dev: [in] device to be attached. * - * Returns struct dma_buf_attachment * for this attachment; may return negative - * error codes. - * + * Returns struct dma_buf_attachment * for this attachment; returns ERR_PTR on + * error. */ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev) @@ -319,9 +318,8 @@ EXPORT_SYMBOL_GPL(dma_buf_detach); * @attach: [in] attachment whose scatterlist is to be returned * @direction: [in] direction of DMA transfer * - * Returns sg_table containing the scatterlist to be returned; may return NULL - * or ERR_PTR. - * + * Returns sg_table containing the scatterlist to be returned; returns ERR_PTR + * on error. */ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, enum dma_data_direction direction) @@ -334,6 +332,8 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, return ERR_PTR(-EINVAL); sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); + if (!sg_table) + sg_table = ERR_PTR(-ENOMEM); return sg_table; } @@ -544,6 +544,8 @@ EXPORT_SYMBOL_GPL(dma_buf_mmap); * These calls are optional in drivers. The intended use for them * is for mapping objects linear in kernel space for high use objects. * Please attempt to use kmap/kunmap before thinking about these interfaces. + * + * Returns NULL on error. 
*/ void *dma_buf_vmap(struct dma_buf *dmabuf) { @@ -566,7 +568,9 @@ void *dma_buf_vmap(struct dma_buf *dmabuf) BUG_ON(dmabuf->vmap_ptr); ptr = dmabuf->ops->vmap(dmabuf); - if (IS_ERR_OR_NULL(ptr)) + if (WARN_ON_ONCE(IS_ERR(ptr))) + ptr = NULL; + if (!ptr) goto out_unlock; dmabuf->vmap_ptr = ptr; diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index 56805c39c90..bb516fdd195 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -471,7 +471,7 @@ struct drm_gem_object *drm_gem_prime_import(struct drm_device *dev, get_dma_buf(dma_buf); sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL); - if (IS_ERR_OR_NULL(sgt)) { + if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); goto fail_detach; } diff --git a/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c b/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c index 59827cc5e77..c786cd4f457 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c +++ b/drivers/gpu/drm/exynos/exynos_drm_dmabuf.c @@ -224,7 +224,7 @@ struct drm_gem_object *exynos_dmabuf_prime_import(struct drm_device *drm_dev, get_dma_buf(dma_buf); sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL); - if (IS_ERR_OR_NULL(sgt)) { + if (IS_ERR(sgt)) { ret = PTR_ERR(sgt); goto err_buf_detach; } diff --git a/drivers/media/v4l2-core/videobuf2-dma-contig.c b/drivers/media/v4l2-core/videobuf2-dma-contig.c index 33d3871d1e1..880be0782dd 100644 --- a/drivers/media/v4l2-core/videobuf2-dma-contig.c +++ b/drivers/media/v4l2-core/videobuf2-dma-contig.c @@ -719,7 +719,7 @@ static int vb2_dc_map_dmabuf(void *mem_priv) /* get the associated scatterlist for this buffer */ sgt = dma_buf_map_attachment(buf->db_attach, buf->dma_dir); - if (IS_ERR_OR_NULL(sgt)) { + if (IS_ERR(sgt)) { pr_err("Error getting dmabuf scatterlist\n"); return -EINVAL; } diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index 276ef9c1880..4e0acefb756 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -351,28 +351,17 @@ static struct device_attribute dev_rescan_attr = __ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, dev_rescan_store); -static void remove_callback(struct device *dev) -{ - pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); -} - static ssize_t -remove_store(struct device *dev, struct device_attribute *dummy, +remove_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int ret = 0; unsigned long val; if (kstrtoul(buf, 0, &val) < 0) return -EINVAL; - /* An attribute cannot be unregistered by one of its own methods, - * so we have to use this roundabout approach. 
- */ - if (val) - ret = device_schedule_callback(dev, remove_callback); - if (ret) - count = ret; + if (val && device_remove_file_self(dev, attr)) + pci_stop_and_remove_bus_device_locked(to_pci_dev(dev)); return count; } static struct device_attribute dev_remove_attr = __ATTR(remove, diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index ebf41e228e5..ee0e85abe1f 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -304,12 +304,6 @@ dcssblk_load_segment(char *name, struct segment_info **seg_info) return rc; } -static void dcssblk_unregister_callback(struct device *dev) -{ - device_unregister(dev); - put_device(dev); -} - /* * device attribute for switching shared/nonshared (exclusive) * operation (show + store) @@ -397,7 +391,13 @@ removeseg: blk_cleanup_queue(dev_info->dcssblk_queue); dev_info->gd->queue = NULL; put_disk(dev_info->gd); - rc = device_schedule_callback(dev, dcssblk_unregister_callback); + up_write(&dcssblk_devices_sem); + + if (device_remove_file_self(dev, attr)) { + device_unregister(dev); + put_device(dev); + } + return rc; out: up_write(&dcssblk_devices_sem); return rc; diff --git a/drivers/s390/cio/ccwgroup.c b/drivers/s390/cio/ccwgroup.c index fd3367a1dc7..dfd7bc681c2 100644 --- a/drivers/s390/cio/ccwgroup.c +++ b/drivers/s390/cio/ccwgroup.c @@ -168,14 +168,12 @@ static ssize_t ccwgroup_online_show(struct device *dev, * Provide an 'ungroup' attribute so the user can remove group devices no * longer needed or accidentially created. Saves memory :) */ -static void ccwgroup_ungroup_callback(struct device *dev) +static void ccwgroup_ungroup(struct ccwgroup_device *gdev) { - struct ccwgroup_device *gdev = to_ccwgroupdev(dev); - mutex_lock(&gdev->reg_mutex); if (device_is_registered(&gdev->dev)) { __ccwgroup_remove_symlinks(gdev); - device_unregister(dev); + device_unregister(&gdev->dev); __ccwgroup_remove_cdev_refs(gdev); } mutex_unlock(&gdev->reg_mutex); @@ -195,10 +193,9 @@ static ssize_t ccwgroup_ungroup_store(struct device *dev, rc = -EINVAL; goto out; } - /* Note that we cannot unregister the device from one of its - * attribute methods, so we have to use this roundabout approach. 
- */ - rc = device_schedule_callback(dev, ccwgroup_ungroup_callback); + + if (device_remove_file_self(dev, attr)) + ccwgroup_ungroup(gdev); out: if (rc) { if (rc != -EAGAIN) @@ -224,6 +221,14 @@ static const struct attribute_group *ccwgroup_attr_groups[] = { NULL, }; +static void ccwgroup_ungroup_workfn(struct work_struct *work) +{ + struct ccwgroup_device *gdev = + container_of(work, struct ccwgroup_device, ungroup_work); + + ccwgroup_ungroup(gdev); +} + static void ccwgroup_release(struct device *dev) { kfree(to_ccwgroupdev(dev)); @@ -323,6 +328,7 @@ int ccwgroup_create_dev(struct device *parent, struct ccwgroup_driver *gdrv, atomic_set(&gdev->onoff, 0); mutex_init(&gdev->reg_mutex); mutex_lock(&gdev->reg_mutex); + INIT_WORK(&gdev->ungroup_work, ccwgroup_ungroup_workfn); gdev->count = num_devices; gdev->dev.bus = &ccwgroup_bus_type; gdev->dev.parent = parent; @@ -404,10 +410,10 @@ EXPORT_SYMBOL(ccwgroup_create_dev); static int ccwgroup_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct device *dev = data; + struct ccwgroup_device *gdev = to_ccwgroupdev(data); if (action == BUS_NOTIFY_UNBIND_DRIVER) - device_schedule_callback(dev, ccwgroup_ungroup_callback); + schedule_work(&gdev->ungroup_work); return NOTIFY_OK; } diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 9117d0bf408..8ead24c3453 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -649,23 +649,12 @@ store_rescan_field (struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(rescan, S_IWUSR, NULL, store_rescan_field); -static void sdev_store_delete_callback(struct device *dev) -{ - scsi_remove_device(to_scsi_device(dev)); -} - static ssize_t sdev_store_delete(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int rc; - - /* An attribute cannot be unregistered by one of its own methods, - * so we have to use this roundabout approach. - */ - rc = device_schedule_callback(dev, sdev_store_delete_callback); - if (rc) - count = rc; + if (device_remove_file_self(dev, attr)) + scsi_remove_device(to_scsi_device(dev)); return count; }; static DEVICE_ATTR(delete, S_IWUSR, NULL, sdev_store_delete); diff --git a/fs/Kconfig b/fs/Kconfig index 7385e54be4b..312393f3294 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -96,6 +96,7 @@ endif # BLOCK menu "Pseudo filesystems" source "fs/proc/Kconfig" +source "fs/kernfs/Kconfig" source "fs/sysfs/Kconfig" config TMPFS diff --git a/fs/Makefile b/fs/Makefile index 47ac07bb4ac..f9cb9876e46 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -52,7 +52,8 @@ obj-$(CONFIG_FHANDLE) += fhandle.o obj-y += quota/ obj-$(CONFIG_PROC_FS) += proc/ -obj-$(CONFIG_SYSFS) += sysfs/ kernfs/ +obj-$(CONFIG_KERNFS) += kernfs/ +obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ @@ -1965,7 +1965,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_css(current, blkio_subsys_id); + css = task_css(current, blkio_cgrp_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/fs/kernfs/Kconfig b/fs/kernfs/Kconfig new file mode 100644 index 00000000000..397b5f7a7a1 --- /dev/null +++ b/fs/kernfs/Kconfig @@ -0,0 +1,7 @@ +# +# KERNFS should be selected by its users +# + +config KERNFS + bool + default n diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5104cf5d25c..939684ebff1 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -8,6 +8,7 @@ * This file is released under the GPLv2. 
*/ +#include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> @@ -18,9 +19,162 @@ #include "kernfs-internal.h" DEFINE_MUTEX(kernfs_mutex); +static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) +static bool kernfs_active(struct kernfs_node *kn) +{ + lockdep_assert_held(&kernfs_mutex); + return atomic_read(&kn->active) >= 0; +} + +static bool kernfs_lockdep(struct kernfs_node *kn) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + return kn->flags & KERNFS_LOCKDEP; +#else + return false; +#endif +} + +static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) +{ + return strlcpy(buf, kn->parent ? kn->name : "/", buflen); +} + +static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf, + size_t buflen) +{ + char *p = buf + buflen; + int len; + + *--p = '\0'; + + do { + len = strlen(kn->name); + if (p - buf < len + 1) { + buf[0] = '\0'; + p = NULL; + break; + } + p -= len; + memcpy(p, kn->name, len); + *--p = '/'; + kn = kn->parent; + } while (kn && kn->parent); + + return p; +} + +/** + * kernfs_name - obtain the name of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Copies the name of @kn into @buf of @buflen bytes. The behavior is + * similar to strlcpy(). It returns the length of @kn's name and if @buf + * isn't long enough, it's filled upto @buflen-1 and nul terminated. + * + * This function can be called from any context. + */ +int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + ret = kernfs_name_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return ret; +} + +/** + * kernfs_path - build full path of a given node + * @kn: kernfs_node of interest + * @buf: buffer to copy @kn's name into + * @buflen: size of @buf + * + * Builds and returns the full path of @kn in @buf of @buflen bytes. The + * path is built from the end of @buf so the returned pointer usually + * doesn't match @buf. If @buf isn't long enough, @buf is nul terminated + * and %NULL is returned. + */ +char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + p = kernfs_path_locked(kn, buf, buflen); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + return p; +} +EXPORT_SYMBOL_GPL(kernfs_path); + +/** + * pr_cont_kernfs_name - pr_cont name of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. + */ +void pr_cont_kernfs_name(struct kernfs_node *kn) +{ + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); + pr_cont("%s", kernfs_pr_cont_buf); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * pr_cont_kernfs_path - pr_cont path of a kernfs_node + * @kn: kernfs_node of interest + * + * This function can be called from any context. 
+ */ +void pr_cont_kernfs_path(struct kernfs_node *kn) +{ + unsigned long flags; + char *p; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + + p = kernfs_path_locked(kn, kernfs_pr_cont_buf, + sizeof(kernfs_pr_cont_buf)); + if (p) + pr_cont("%s", p); + else + pr_cont("<name too long>"); + + spin_unlock_irqrestore(&kernfs_rename_lock, flags); +} + +/** + * kernfs_get_parent - determine the parent node and pin it + * @kn: kernfs_node of interest + * + * Determines @kn's parent, pins and returns it. This function can be + * called from any context. + */ +struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) +{ + struct kernfs_node *parent; + unsigned long flags; + + spin_lock_irqsave(&kernfs_rename_lock, flags); + parent = kn->parent; + kernfs_get(parent); + spin_unlock_irqrestore(&kernfs_rename_lock, flags); + + return parent; +} + /** * kernfs_name_hash * @name: Null terminated string to hash @@ -105,18 +259,24 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * - * Unlink @kn from its sibling rbtree which starts from - * kn->parent->dir.children. + * Try to unlink @kn from its sibling rbtree which starts from + * kn->parent->dir.children. Returns %true if @kn was actually + * removed, %false if @kn wasn't on the rbtree. * * Locking: * mutex_lock(kernfs_mutex) */ -static void kernfs_unlink_sibling(struct kernfs_node *kn) +static bool kernfs_unlink_sibling(struct kernfs_node *kn) { + if (RB_EMPTY_NODE(&kn->rb)) + return false; + if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; rb_erase(&kn->rb, &kn->parent->dir.children); + RB_CLEAR_NODE(&kn->rb); + return true; } /** @@ -137,7 +297,7 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) if (!atomic_inc_unless_negative(&kn->active)) return NULL; - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } @@ -151,55 +311,57 @@ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) */ void kernfs_put_active(struct kernfs_node *kn) { + struct kernfs_root *root = kernfs_root(kn); int v; if (unlikely(!kn)) return; - if (kn->flags & KERNFS_LOCKDEP) + if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, 1, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; - /* - * atomic_dec_return() is a mb(), we'll always see the updated - * kn->u.completion. - */ - complete(kn->u.completion); + wake_up_all(&root->deactivate_waitq); } /** - * kernfs_deactivate - deactivate kernfs_node - * @kn: kernfs_node to deactivate + * kernfs_drain - drain kernfs_node + * @kn: kernfs_node to drain * - * Deny new active references and drain existing ones. + * Drain existing usages and nuke all existing mmaps of @kn. Mutiple + * removers may invoke this function concurrently on @kn and all will + * return after draining is complete. 
*/ -static void kernfs_deactivate(struct kernfs_node *kn) +static void kernfs_drain(struct kernfs_node *kn) + __releases(&kernfs_mutex) __acquires(&kernfs_mutex) { - DECLARE_COMPLETION_ONSTACK(wait); - int v; + struct kernfs_root *root = kernfs_root(kn); - BUG_ON(!(kn->flags & KERNFS_REMOVED)); + lockdep_assert_held(&kernfs_mutex); + WARN_ON_ONCE(kernfs_active(kn)); - if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF)) - return; + mutex_unlock(&kernfs_mutex); - kn->u.completion = (void *)&wait; + if (kernfs_lockdep(kn)) { + rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); + if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) + lock_contended(&kn->dep_map, _RET_IP_); + } - rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); - /* atomic_add_return() is a mb(), put_active() will always see - * the updated kn->u.completion. - */ - v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active); + /* but everyone should wait for draining */ + wait_event(root->deactivate_waitq, + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); - if (v != KN_DEACTIVATED_BIAS) { - lock_contended(&kn->dep_map, _RET_IP_); - wait_for_completion(&wait); + if (kernfs_lockdep(kn)) { + lock_acquired(&kn->dep_map, _RET_IP_); + rwsem_release(&kn->dep_map, 1, _RET_IP_); } - lock_acquired(&kn->dep_map, _RET_IP_); - rwsem_release(&kn->dep_map, 1, _RET_IP_); + kernfs_unmap_bin_file(kn); + + mutex_lock(&kernfs_mutex); } /** @@ -230,13 +392,15 @@ void kernfs_put(struct kernfs_node *kn) return; root = kernfs_root(kn); repeat: - /* Moving/renaming is always done while holding reference. + /* + * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; - WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n", - parent ? parent->name : "", kn->name); + WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, + "kernfs_put: %s/%s: released with incorrect active_ref %d\n", + parent ? parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); @@ -278,8 +442,8 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kn = dentry->d_fsdata; mutex_lock(&kernfs_mutex); - /* The kernfs node has been deleted */ - if (kn->flags & KERNFS_REMOVED) + /* The kernfs node has been deactivated */ + if (!kernfs_active(kn)) goto out_bad; /* The kernfs node has been moved? */ @@ -324,6 +488,24 @@ const struct dentry_operations kernfs_dops = { .d_release = kernfs_dop_release, }; +/** + * kernfs_node_from_dentry - determine kernfs_node associated with a dentry + * @dentry: the dentry in question + * + * Return the kernfs_node associated with @dentry. If @dentry is not a + * kernfs one, %NULL is returned. + * + * While the returned kernfs_node will stay accessible as long as @dentry + * is accessible, the returned node can be in any state and the caller is + * fully responsible for determining what's accessible. 
+ */ +struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) +{ + if (dentry->d_op == &kernfs_dops) + return dentry->d_fsdata; + return NULL; +} + static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, const char *name, umode_t mode, unsigned flags) @@ -348,11 +530,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, kn->ino = ret; atomic_set(&kn->count, 1); - atomic_set(&kn->active, 0); + atomic_set(&kn->active, KN_DEACTIVATED_BIAS); + RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; - kn->flags = flags | KERNFS_REMOVED; + kn->flags = flags; return kn; @@ -378,69 +561,44 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, } /** - * kernfs_addrm_start - prepare for kernfs_node add/remove - * @acxt: pointer to kernfs_addrm_cxt to be used - * - * This function is called when the caller is about to add or remove - * kernfs_node. This function acquires kernfs_mutex. @acxt is used - * to keep and pass context to other addrm functions. - * - * LOCKING: - * Kernel thread context (may sleep). kernfs_mutex is locked on - * return. - */ -void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt) - __acquires(kernfs_mutex) -{ - memset(acxt, 0, sizeof(*acxt)); - - mutex_lock(&kernfs_mutex); -} - -/** * kernfs_add_one - add kernfs_node to parent without warning - * @acxt: addrm context to use * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. * - * This function should be called between calls to - * kernfs_addrm_start() and kernfs_addrm_finish() and should be passed - * the same @acxt as passed to kernfs_addrm_start(). - * - * LOCKING: - * Determined by kernfs_addrm_start(). - * * RETURNS: * 0 on success, -EEXIST if entry with the given name already * exists. */ -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) +int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; - bool has_ns = kernfs_ns_enabled(parent); struct kernfs_iattrs *ps_iattr; + bool has_ns; int ret; - if (has_ns != (bool)kn->ns) { - WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, kn->name); - return -EINVAL; - } + mutex_lock(&kernfs_mutex); + + ret = -EINVAL; + has_ns = kernfs_ns_enabled(parent); + if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", + has_ns ? "required" : "invalid", parent->name, kn->name)) + goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) - return -EINVAL; + goto out_unlock; - if (parent->flags & KERNFS_REMOVED) - return -ENOENT; + ret = -ENOENT; + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) + goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); ret = kernfs_link_sibling(kn); if (ret) - return ret; + goto out_unlock; /* Update timestamps on the parent */ ps_iattr = parent->iattr; @@ -449,82 +607,22 @@ int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn) ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } - /* Mark the entry added into directory tree */ - kn->flags &= ~KERNFS_REMOVED; - - return 0; -} - -/** - * kernfs_remove_one - remove kernfs_node from parent - * @acxt: addrm context to use - * @kn: kernfs_node to be removed - * - * Mark @kn removed and drop nlink of parent inode if @kn is a - * directory. @kn is unlinked from the children list. 
- * - * This function should be called between calls to - * kernfs_addrm_start() and kernfs_addrm_finish() and should be - * passed the same @acxt as passed to kernfs_addrm_start(). - * - * LOCKING: - * Determined by kernfs_addrm_start(). - */ -static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt, - struct kernfs_node *kn) -{ - struct kernfs_iattrs *ps_iattr; + mutex_unlock(&kernfs_mutex); /* - * Removal can be called multiple times on the same node. Only the - * first invocation is effective and puts the base ref. + * Activate the new node unless CREATE_DEACTIVATED is requested. + * If not activated here, the kernfs user is responsible for + * activating the node with kernfs_activate(). A node which hasn't + * been activated is not visible to userland and its removal won't + * trigger deactivation. */ - if (kn->flags & KERNFS_REMOVED) - return; - - if (kn->parent) { - kernfs_unlink_sibling(kn); - - /* Update timestamps on the parent */ - ps_iattr = kn->parent->iattr; - if (ps_iattr) { - ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; - ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; - } - } - - kn->flags |= KERNFS_REMOVED; - kn->u.removed_list = acxt->removed; - acxt->removed = kn; -} + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); + return 0; -/** - * kernfs_addrm_finish - finish up kernfs_node add/remove - * @acxt: addrm context to finish up - * - * Finish up kernfs_node add/remove. Resources acquired by - * kernfs_addrm_start() are released and removed kernfs_nodes are - * cleaned up. - * - * LOCKING: - * kernfs_mutex is released. - */ -void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt) - __releases(kernfs_mutex) -{ - /* release resources acquired by kernfs_addrm_start() */ +out_unlock: mutex_unlock(&kernfs_mutex); - - /* kill removed kernfs_nodes */ - while (acxt->removed) { - struct kernfs_node *kn = acxt->removed; - - acxt->removed = kn->u.removed_list; - - kernfs_deactivate(kn); - kernfs_unmap_bin_file(kn); - kernfs_put(kn); - } + return ret; } /** @@ -595,13 +693,15 @@ EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_create_root - create a new kernfs hierarchy - * @kdops: optional directory syscall operations for the hierarchy + * @scops: optional syscall operations for the hierarchy + * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Returns the root of the new hierarchy on success, ERR_PTR() value on * failure. 
*/ -struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) +struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, + unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; @@ -620,12 +720,16 @@ struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) return ERR_PTR(-ENOMEM); } - kn->flags &= ~KERNFS_REMOVED; kn->priv = priv; kn->dir.root = root; - root->dir_ops = kdops; + root->syscall_ops = scops; + root->flags = flags; root->kn = kn; + init_waitqueue_head(&root->deactivate_waitq); + + if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) + kernfs_activate(kn); return root; } @@ -656,7 +760,6 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, void *priv, const void *ns) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; int rc; @@ -670,10 +773,7 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, kn->priv = priv; /* link in */ - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn); - kernfs_addrm_finish(&acxt); - + rc = kernfs_add_one(kn); if (!rc) return kn; @@ -699,7 +799,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* no such entry */ - if (!kn) { + if (!kn || !kernfs_active(kn)) { ret = NULL; goto out_unlock; } @@ -724,23 +824,37 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; - struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + int ret; - if (!kdops || !kdops->mkdir) + if (!scops || !scops->mkdir) return -EPERM; - return kdops->mkdir(parent, dentry->d_name.name, mode); + if (!kernfs_get_active(parent)) + return -ENODEV; + + ret = scops->mkdir(parent, dentry->d_name.name, mode); + + kernfs_put_active(parent); + return ret; } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = dentry->d_fsdata; - struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; - if (!kdops || !kdops->rmdir) + if (!scops || !scops->rmdir) return -EPERM; - return kdops->rmdir(kn); + if (!kernfs_get_active(kn)) + return -ENODEV; + + ret = scops->rmdir(kn); + + kernfs_put_active(kn); + return ret; } static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, @@ -748,12 +862,25 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, { struct kernfs_node *kn = old_dentry->d_fsdata; struct kernfs_node *new_parent = new_dir->i_private; - struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops; + struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; + int ret; - if (!kdops || !kdops->rename) + if (!scops || !scops->rename) return -EPERM; - return kdops->rename(kn, new_parent, new_dentry->d_name.name); + if (!kernfs_get_active(kn)) + return -ENODEV; + + if (!kernfs_get_active(new_parent)) { + kernfs_put_active(kn); + return -ENODEV; + } + + ret = scops->rename(kn, new_parent, new_dentry->d_name.name); + + kernfs_put_active(new_parent); + kernfs_put_active(kn); + return ret; } const struct inode_operations kernfs_dir_iops = { @@ -826,23 +953,104 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return pos->parent; } -static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, - struct kernfs_node 
*kn) +/** + * kernfs_activate - activate a node which started deactivated + * @kn: kernfs_node whose subtree is to be activated + * + * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node + * needs to be explicitly activated. A node which hasn't been activated + * isn't visible to userland and deactivation is skipped during its + * removal. This is useful to construct atomic init sequences where + * creation of multiple nodes should either succeed or fail atomically. + * + * The caller is responsible for ensuring that this function is not called + * after kernfs_remove*() is invoked on @kn. + */ +void kernfs_activate(struct kernfs_node *kn) { - struct kernfs_node *pos, *next; + struct kernfs_node *pos; - if (!kn) + mutex_lock(&kernfs_mutex); + + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) { + if (!pos || (pos->flags & KERNFS_ACTIVATED)) + continue; + + WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); + WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); + + atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); + pos->flags |= KERNFS_ACTIVATED; + } + + mutex_unlock(&kernfs_mutex); +} + +static void __kernfs_remove(struct kernfs_node *kn) +{ + struct kernfs_node *pos; + + lockdep_assert_held(&kernfs_mutex); + + /* + * Short-circuit if non-root @kn has already finished removal. + * This is for kernfs_remove_self() which plays with active ref + * after removal. + */ + if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) return; pr_debug("kernfs %s: removing\n", kn->name); - next = NULL; + /* prevent any new usage under @kn by deactivating all nodes */ + pos = NULL; + while ((pos = kernfs_next_descendant_post(pos, kn))) + if (kernfs_active(pos)) + atomic_add(KN_DEACTIVATED_BIAS, &pos->active); + + /* deactivate and unlink the subtree node-by-node */ do { - pos = next; - next = kernfs_next_descendant_post(pos, kn); - if (pos) - kernfs_remove_one(acxt, pos); - } while (next); + pos = kernfs_leftmost_descendant(kn); + + /* + * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * base ref could have been put by someone else by the time + * the function returns. Make sure it doesn't go away + * underneath us. + */ + kernfs_get(pos); + + /* + * Drain iff @kn was activated. This avoids draining and + * its lockdep annotations for nodes which have never been + * activated and allows embedding kernfs_remove() in create + * error paths without worrying about draining. + */ + if (kn->flags & KERNFS_ACTIVATED) + kernfs_drain(pos); + else + WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); + + /* + * kernfs_unlink_sibling() succeeds once per node. Use it + * to decide who's responsible for cleanups. + */ + if (!pos->parent || kernfs_unlink_sibling(pos)) { + struct kernfs_iattrs *ps_iattr = + pos->parent ? 
pos->parent->iattr : NULL; + + /* update timestamps on the parent */ + if (ps_iattr) { + ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME; + ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME; + } + + kernfs_put(pos); + } + + kernfs_put(pos); + } while (pos != kn); } /** @@ -853,11 +1061,140 @@ static void __kernfs_remove(struct kernfs_addrm_cxt *acxt, */ void kernfs_remove(struct kernfs_node *kn) { - struct kernfs_addrm_cxt acxt; + mutex_lock(&kernfs_mutex); + __kernfs_remove(kn); + mutex_unlock(&kernfs_mutex); +} - kernfs_addrm_start(&acxt); - __kernfs_remove(&acxt, kn); - kernfs_addrm_finish(&acxt); +/** + * kernfs_break_active_protection - break out of active protection + * @kn: the self kernfs_node + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. Each invocation of + * this function must also be matched with an invocation of + * kernfs_unbreak_active_protection(). + * + * This function releases the active reference of @kn the caller is + * holding. Once this function is called, @kn may be removed at any point + * and the caller is solely responsible for ensuring that the objects it + * dereferences are accessible. + */ +void kernfs_break_active_protection(struct kernfs_node *kn) +{ + /* + * Take out ourself out of the active ref dependency chain. If + * we're called without an active ref, lockdep will complain. + */ + kernfs_put_active(kn); +} + +/** + * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() + * @kn: the self kernfs_node + * + * If kernfs_break_active_protection() was called, this function must be + * invoked before finishing the kernfs operation. Note that while this + * function restores the active reference, it doesn't and can't actually + * restore the active protection - @kn may already or be in the process of + * being removed. Once kernfs_break_active_protection() is invoked, that + * protection is irreversibly gone for the kernfs operation instance. + * + * While this function may be called at any point after + * kernfs_break_active_protection() is invoked, its most useful location + * would be right before the enclosing kernfs operation returns. + */ +void kernfs_unbreak_active_protection(struct kernfs_node *kn) +{ + /* + * @kn->active could be in any state; however, the increment we do + * here will be undone as soon as the enclosing kernfs operation + * finishes and this temporary bump can't break anything. If @kn + * is alive, nothing changes. If @kn is being deactivated, the + * soon-to-follow put will either finish deactivation or restore + * deactivated state. If @kn is already removed, the temporary + * bump is guaranteed to be gone before @kn is released. + */ + atomic_inc(&kn->active); + if (kernfs_lockdep(kn)) + rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); +} + +/** + * kernfs_remove_self - remove a kernfs_node from its own method + * @kn: the self kernfs_node to remove + * + * The caller must be running off of a kernfs operation which is invoked + * with an active reference - e.g. one of kernfs_ops. This can be used to + * implement a file operation which deletes itself. + * + * For example, the "delete" file for a sysfs device directory can be + * implemented by invoking kernfs_remove_self() on the "delete" file + * itself. This function breaks the circular dependency of trying to + * deactivate self while holding an active ref itself. It isn't necessary + * to modify the usual removal path to use kernfs_remove_self(). 
The + * "delete" implementation can simply invoke kernfs_remove_self() on self + * before proceeding with the usual removal path. kernfs will ignore later + * kernfs_remove() on self. + * + * kernfs_remove_self() can be called multiple times concurrently on the + * same kernfs_node. Only the first one actually performs removal and + * returns %true. All others will wait until the kernfs operation which + * won self-removal finishes and return %false. Note that the losers wait + * for the completion of not only the winning kernfs_remove_self() but also + * the whole kernfs_ops which won the arbitration. This can be used to + * guarantee, for example, all concurrent writes to a "delete" file to + * finish only after the whole operation is complete. + */ +bool kernfs_remove_self(struct kernfs_node *kn) +{ + bool ret; + + mutex_lock(&kernfs_mutex); + kernfs_break_active_protection(kn); + + /* + * SUICIDAL is used to arbitrate among competing invocations. Only + * the first one will actually perform removal. When the removal + * is complete, SUICIDED is set and the active ref is restored + * while holding kernfs_mutex. The ones which lost arbitration + * waits for SUICDED && drained which can happen only after the + * enclosing kernfs operation which executed the winning instance + * of kernfs_remove_self() finished. + */ + if (!(kn->flags & KERNFS_SUICIDAL)) { + kn->flags |= KERNFS_SUICIDAL; + __kernfs_remove(kn); + kn->flags |= KERNFS_SUICIDED; + ret = true; + } else { + wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; + DEFINE_WAIT(wait); + + while (true) { + prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); + + if ((kn->flags & KERNFS_SUICIDED) && + atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) + break; + + mutex_unlock(&kernfs_mutex); + schedule(); + mutex_lock(&kernfs_mutex); + } + finish_wait(waitq, &wait); + WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); + ret = false; + } + + /* + * This must be done while holding kernfs_mutex; otherwise, waiting + * for SUICIDED && deactivated could finish prematurely. 
+ */ + kernfs_unbreak_active_protection(kn); + + mutex_unlock(&kernfs_mutex); + return ret; } /** @@ -872,7 +1209,6 @@ void kernfs_remove(struct kernfs_node *kn) int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; if (!parent) { @@ -881,13 +1217,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - kernfs_addrm_start(&acxt); + mutex_lock(&kernfs_mutex); kn = kernfs_find_ns(parent, name, ns); if (kn) - __kernfs_remove(&acxt, kn); + __kernfs_remove(kn); - kernfs_addrm_finish(&acxt); + mutex_unlock(&kernfs_mutex); if (kn) return 0; @@ -905,12 +1241,18 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { + struct kernfs_node *old_parent; + const char *old_name = NULL; int error; + /* can't move or rename root */ + if (!kn->parent) + return -EINVAL; + mutex_lock(&kernfs_mutex); error = -ENOENT; - if ((kn->flags | new_parent->flags) & KERNFS_REMOVED) + if (!kernfs_active(kn) || !kernfs_active(new_parent)) goto out; error = 0; @@ -928,13 +1270,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, new_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out; - - if (kn->flags & KERNFS_STATIC_NAME) - kn->flags &= ~KERNFS_STATIC_NAME; - else - kfree(kn->name); - - kn->name = new_name; + } else { + new_name = NULL; } /* @@ -942,12 +1279,29 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, */ kernfs_unlink_sibling(kn); kernfs_get(new_parent); - kernfs_put(kn->parent); - kn->ns = new_ns; - kn->hash = kernfs_name_hash(kn->name, kn->ns); + + /* rename_lock protects ->parent and ->name accessors */ + spin_lock_irq(&kernfs_rename_lock); + + old_parent = kn->parent; kn->parent = new_parent; + + kn->ns = new_ns; + if (new_name) { + if (!(kn->flags & KERNFS_STATIC_NAME)) + old_name = kn->name; + kn->flags &= ~KERNFS_STATIC_NAME; + kn->name = new_name; + } + + spin_unlock_irq(&kernfs_rename_lock); + + kn->hash = kernfs_name_hash(new_name, new_ns); kernfs_link_sibling(kn); + kernfs_put(old_parent); + kfree(old_name); + error = 0; out: mutex_unlock(&kernfs_mutex); @@ -970,7 +1324,7 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { - int valid = !(pos->flags & KERNFS_REMOVED) && + int valid = kernfs_active(pos) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) @@ -989,8 +1343,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, break; } } - /* Skip over entries in the wrong namespace */ - while (pos && pos->ns != ns) { + /* Skip over entries which are dying/dead or in the wrong namespace */ + while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; @@ -1004,14 +1358,15 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); - if (pos) + if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); - } while (pos && pos->ns != ns); + } while (pos && (!kernfs_active(pos) || pos->ns != ns)); + } return pos; } diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index dbf397bfdff..ddcb471b9cc 100644 --- a/fs/kernfs/file.c +++ 
b/fs/kernfs/file.c @@ -252,19 +252,9 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct kernfs_open_file *of = kernfs_of(file); - ssize_t len = min_t(size_t, count, PAGE_SIZE); const struct kernfs_ops *ops; - char *buf; - - buf = kmalloc(len + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, user_buf, len)) { - len = -EFAULT; - goto out_free; - } - buf[len] = '\0'; /* guarantee string termination */ + char *buf = NULL; + ssize_t len; /* * @of->mutex nests outside active ref and is just to ensure that @@ -273,22 +263,45 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); - len = -ENODEV; - goto out_free; + return -ENODEV; } ops = kernfs_ops(of->kn); - if (ops->write) - len = ops->write(of, buf, len, *ppos); - else + if (!ops->write) { len = -EINVAL; + goto out_unlock; + } + if (ops->atomic_write_len) { + len = count; + if (len > ops->atomic_write_len) { + len = -E2BIG; + goto out_unlock; + } + } else { + len = min_t(size_t, count, PAGE_SIZE); + } + + buf = kmalloc(len + 1, GFP_KERNEL); + if (!buf) { + len = -ENOMEM; + goto out_unlock; + } + + if (copy_from_user(buf, user_buf, len)) { + len = -EFAULT; + goto out_unlock; + } + buf[len] = '\0'; /* guarantee string termination */ + + len = ops->write(of, buf, len, *ppos); +out_unlock: kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) *ppos += len; -out_free: + kfree(buf); return len; } @@ -820,7 +833,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, bool name_is_static, struct lock_class_key *key) { - struct kernfs_addrm_cxt acxt; struct kernfs_node *kn; unsigned flags; int rc; @@ -855,10 +867,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; - kernfs_addrm_start(&acxt); - rc = kernfs_add_one(&acxt, kn); - kernfs_addrm_finish(&acxt); - + rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index eb536b76374..a91d7a1113d 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -26,7 +26,8 @@ struct kernfs_iattrs { struct simple_xattrs xattrs; }; -#define KN_DEACTIVATED_BIAS INT_MIN +/* +1 to avoid triggering overflow warning when negating it */ +#define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ @@ -45,13 +46,6 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) } /* - * Context structure to be used while adding/removing nodes. 
- */ -struct kernfs_addrm_cxt { - struct kernfs_node *removed; -}; - -/* * mount.c */ struct kernfs_super_info { @@ -100,9 +94,7 @@ extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); -void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt); -int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn); -void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt); +int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, unsigned flags); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 0d6ce895a9e..e5b28b0ebc3 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -19,12 +19,49 @@ struct kmem_cache *kernfs_node_cache; +static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct kernfs_root *root = kernfs_info(sb)->root; + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->remount_fs) + return scops->remount_fs(root, flags, data); + return 0; +} + +static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) +{ + struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_syscall_ops *scops = root->syscall_ops; + + if (scops && scops->show_options) + return scops->show_options(sf, root); + return 0; +} + static const struct super_operations kernfs_sops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = kernfs_evict_inode, + + .remount_fs = kernfs_sop_remount_fs, + .show_options = kernfs_sop_show_options, }; +/** + * kernfs_root_from_sb - determine kernfs_root associated with a super_block + * @sb: the super_block in question + * + * Return the kernfs_root associated with @sb. If @sb is not a kernfs one, + * %NULL is returned. 
+ */ +struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) +{ + if (sb->s_op == &kernfs_sops) + return kernfs_info(sb)->root; + return NULL; +} + static int kernfs_fill_super(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 4d457055acb..8a198898e39 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -27,7 +27,6 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, struct kernfs_node *target) { struct kernfs_node *kn; - struct kernfs_addrm_cxt acxt; int error; kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK); @@ -39,10 +38,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, kn->symlink.target_kn = target; kernfs_get(target); /* ref owned by symlink */ - kernfs_addrm_start(&acxt); - error = kernfs_add_one(&acxt, kn); - kernfs_addrm_finish(&acxt); - + error = kernfs_add_one(kn); if (!error) return kn; diff --git a/fs/sysfs/Kconfig b/fs/sysfs/Kconfig index 8c41feacbac..b2756014508 100644 --- a/fs/sysfs/Kconfig +++ b/fs/sysfs/Kconfig @@ -1,6 +1,7 @@ config SYSFS bool "sysfs file system support" if EXPERT default y + select KERNFS help The sysfs filesystem is a virtual filesystem that the kernel uses to export internal kernel objects, their attributes, and their diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index ee0d761c317..0b45ff42f37 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -19,39 +19,18 @@ DEFINE_SPINLOCK(sysfs_symlink_target_lock); -/** - * sysfs_pathname - return full path to sysfs dirent - * @kn: kernfs_node whose path we want - * @path: caller allocated buffer of size PATH_MAX - * - * Gives the name "/" to the sysfs_root entry; any path returned - * is relative to wherever sysfs is mounted. - */ -static char *sysfs_pathname(struct kernfs_node *kn, char *path) -{ - if (kn->parent) { - sysfs_pathname(kn->parent, path); - strlcat(path, "/", PATH_MAX); - } - strlcat(path, kn->name, PATH_MAX); - return path; -} - void sysfs_warn_dup(struct kernfs_node *parent, const char *name) { - char *path; + char *buf, *path = NULL; - path = kzalloc(PATH_MAX, GFP_KERNEL); - if (path) { - sysfs_pathname(parent, path); - strlcat(path, "/", PATH_MAX); - strlcat(path, name, PATH_MAX); - } + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (buf) + path = kernfs_path(parent, buf, PATH_MAX); - WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s'\n", - path ? path : name); + WARN(1, KERN_WARNING "sysfs: cannot create duplicate filename '%s/%s'\n", + path, name); - kfree(path); + kfree(buf); } /** @@ -122,9 +101,13 @@ void sysfs_remove_dir(struct kobject *kobj) int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { - struct kernfs_node *parent = kobj->sd->parent; + struct kernfs_node *parent; + int ret; - return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); + parent = kernfs_get_parent(kobj->sd); + ret = kernfs_rename_ns(kobj->sd, parent, new_name, new_ns); + kernfs_put(parent); + return ret; } int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, @@ -133,7 +116,6 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, struct kernfs_node *kn = kobj->sd; struct kernfs_node *new_parent; - BUG_ON(!kn->parent); new_parent = new_parent_kobj && new_parent_kobj->sd ? 
new_parent_kobj->sd : sysfs_root_kn; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 810cf6e613e..28cc1acd543 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -372,6 +372,29 @@ void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, } EXPORT_SYMBOL_GPL(sysfs_remove_file_ns); +/** + * sysfs_remove_file_self - remove an object attribute from its own method + * @kobj: object we're acting for + * @attr: attribute descriptor + * + * See kernfs_remove_self() for details. + */ +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) +{ + struct kernfs_node *parent = kobj->sd; + struct kernfs_node *kn; + bool ret; + + kn = kernfs_find_and_get(parent, attr->name); + if (WARN_ON_ONCE(!kn)) + return false; + + ret = kernfs_remove_self(kn); + + kernfs_put(kn); + return ret; +} + void sysfs_remove_files(struct kobject *kobj, const struct attribute **ptr) { int i; @@ -430,95 +453,3 @@ void sysfs_remove_bin_file(struct kobject *kobj, kernfs_remove_by_name(kobj->sd, attr->attr.name); } EXPORT_SYMBOL_GPL(sysfs_remove_bin_file); - -struct sysfs_schedule_callback_struct { - struct list_head workq_list; - struct kobject *kobj; - void (*func)(void *); - void *data; - struct module *owner; - struct work_struct work; -}; - -static struct workqueue_struct *sysfs_workqueue; -static DEFINE_MUTEX(sysfs_workq_mutex); -static LIST_HEAD(sysfs_workq); -static void sysfs_schedule_callback_work(struct work_struct *work) -{ - struct sysfs_schedule_callback_struct *ss = container_of(work, - struct sysfs_schedule_callback_struct, work); - - (ss->func)(ss->data); - kobject_put(ss->kobj); - module_put(ss->owner); - mutex_lock(&sysfs_workq_mutex); - list_del(&ss->workq_list); - mutex_unlock(&sysfs_workq_mutex); - kfree(ss); -} - -/** - * sysfs_schedule_callback - helper to schedule a callback for a kobject - * @kobj: object we're acting for. - * @func: callback function to invoke later. - * @data: argument to pass to @func. - * @owner: module owning the callback code - * - * sysfs attribute methods must not unregister themselves or their parent - * kobject (which would amount to the same thing). Attempts to do so will - * deadlock, since unregistration is mutually exclusive with driver - * callbacks. - * - * Instead methods can call this routine, which will attempt to allocate - * and schedule a workqueue request to call back @func with @data as its - * argument in the workqueue's process context. @kobj will be pinned - * until @func returns. - * - * Returns 0 if the request was submitted, -ENOMEM if storage could not - * be allocated, -ENODEV if a reference to @owner isn't available, - * -EAGAIN if a callback has already been scheduled for @kobj. 
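sysfs_remove_file_self() is the sysfs-level wrapper for the self-removal behaviour documented at kernfs_remove_self(): an attribute's own store method may delete its file and then tear down the owning object without deadlocking against active-reference draining. A rough sketch of the intended calling pattern; discard_store() and its kobject are purely hypothetical and only illustrate the if-I-won-the-race-then-proceed convention:

#include <linux/kobject.h>
#include <linux/sysfs.h>

static ssize_t discard_store(struct kobject *kobj, struct kobj_attribute *attr,
			     const char *buf, size_t count)
{
	if (!sysfs_remove_file_self(kobj, &attr->attr))
		return count;	/* somebody else already won the removal race */

	/* the file is gone; it is now safe to drop the object itself */
	kobject_put(kobj);
	return count;
}
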
- */ -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner) -{ - struct sysfs_schedule_callback_struct *ss, *tmp; - - if (!try_module_get(owner)) - return -ENODEV; - - mutex_lock(&sysfs_workq_mutex); - list_for_each_entry_safe(ss, tmp, &sysfs_workq, workq_list) - if (ss->kobj == kobj) { - module_put(owner); - mutex_unlock(&sysfs_workq_mutex); - return -EAGAIN; - } - mutex_unlock(&sysfs_workq_mutex); - - if (sysfs_workqueue == NULL) { - sysfs_workqueue = create_singlethread_workqueue("sysfsd"); - if (sysfs_workqueue == NULL) { - module_put(owner); - return -ENOMEM; - } - } - - ss = kmalloc(sizeof(*ss), GFP_KERNEL); - if (!ss) { - module_put(owner); - return -ENOMEM; - } - kobject_get(kobj); - ss->kobj = kobj; - ss->func = func; - ss->data = data; - ss->owner = owner; - INIT_WORK(&ss->work, sysfs_schedule_callback_work); - INIT_LIST_HEAD(&ss->workq_list); - mutex_lock(&sysfs_workq_mutex); - list_add_tail(&ss->workq_list, &sysfs_workq); - mutex_unlock(&sysfs_workq_mutex); - queue_work(sysfs_workqueue, &ss->work); - return 0; -} -EXPORT_SYMBOL_GPL(sysfs_schedule_callback); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 6211230814f..5c7fdd9c681 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -62,7 +62,7 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, NULL); + sysfs_root = kernfs_create_root(NULL, 0, NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9450f025fe0..8c283a910b9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -14,14 +14,13 @@ #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> -#include <linux/prio_heap.h> #include <linux/rwsem.h> #include <linux/idr.h> #include <linux/workqueue.h> -#include <linux/xattr.h> #include <linux/fs.h> #include <linux/percpu-refcount.h> #include <linux/seq_file.h> +#include <linux/kernfs.h> #ifdef CONFIG_CGROUPS @@ -37,28 +36,13 @@ extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); -extern int cgroup_load_subsys(struct cgroup_subsys *ss); -extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern int proc_cgroup_show(struct seq_file *, void *); -/* - * Define the enumeration of all cgroup subsystems. - * - * We define ids for builtin subsystems and then modular ones. - */ -#define SUBSYS(_x) _x ## _subsys_id, +/* define the enumeration of all cgroup subsystems */ +#define SUBSYS(_x) _x ## _cgrp_id, enum cgroup_subsys_id { -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) #include <linux/cgroup_subsys.h> -#undef IS_SUBSYS_ENABLED - CGROUP_BUILTIN_SUBSYS_COUNT, - - __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1, - -#define IS_SUBSYS_ENABLED(option) IS_MODULE(option) -#include <linux/cgroup_subsys.h> -#undef IS_SUBSYS_ENABLED CGROUP_SUBSYS_COUNT, }; #undef SUBSYS @@ -153,11 +137,6 @@ enum { CGRP_SANE_BEHAVIOR, }; -struct cgroup_name { - struct rcu_head rcu_head; - char name[]; -}; - struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -174,16 +153,17 @@ struct cgroup { /* the number of attached css's */ int nr_css; + atomic_t refcnt; + /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. 
*/ struct list_head sibling; /* my parent's children */ struct list_head children; /* my children */ - struct list_head files; /* my files */ struct cgroup *parent; /* my parent */ - struct dentry *dentry; /* cgroup fs entry, RCU protected */ + struct kernfs_node *kn; /* cgroup kernfs entry */ /* * Monotonically increasing unique serial number which defines a @@ -193,19 +173,6 @@ struct cgroup { */ u64 serial_nr; - /* - * This is a copy of dentry->d_name, and it's needed because - * we can't use dentry->d_name in cgroup_path(). - * - * You must acquire rcu_read_lock() to access cgrp->name, and - * the only place that can change it is rename(), which is - * protected by parent dir's i_mutex. - * - * Normally you should use cgroup_name() wrapper rather than - * access it directly. - */ - struct cgroup_name __rcu *name; - /* Private pointers for each registered subsystem */ struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; @@ -237,9 +204,6 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; - - /* directory xattrs */ - struct simple_xattrs xattrs; }; #define MAX_CGROUP_ROOT_NAMELEN 64 @@ -262,8 +226,8 @@ enum { * * The followings are the behaviors currently affected this flag. * - * - Mount options "noprefix" and "clone_children" are disallowed. - * Also, cgroupfs file cgroup.clone_children is not created. + * - Mount options "noprefix", "xattr", "clone_children", + * "release_agent" and "name" are disallowed. * * - When mounting an existing superblock, mount options should * match. @@ -281,6 +245,8 @@ enum { * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * + * - "cgroup.clone_children" is removed. + * * - cpuset: tasks will be kept in empty cpusets when hotplug happens * and take masks of ancestors with non-empty cpus/mems, instead of * being moved to an ancestor. @@ -300,17 +266,15 @@ enum { /* mount options live below bit 16 */ CGRP_ROOT_OPTION_MASK = (1 << 16) - 1, - - CGRP_ROOT_SUBSYS_BOUND = (1 << 16), /* subsystems finished binding */ }; /* * A cgroupfs_root represents the root of a cgroup hierarchy, and may be - * associated with a superblock to form an active hierarchy. This is + * associated with a kernfs_root to form an active hierarchy. This is * internal to cgroup core. Don't access directly from controllers. */ struct cgroupfs_root { - struct super_block *sb; + struct kernfs_root *kf_root; /* The bitmask of subsystems attached to this hierarchy */ unsigned long subsys_mask; @@ -318,11 +282,11 @@ struct cgroupfs_root { /* Unique id for this hierarchy. */ int hierarchy_id; - /* The root cgroup for this hierarchy */ + /* The root cgroup. Root is destroyed on its release. */ struct cgroup top_cgroup; - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; + /* Number of cgroups in the hierarchy, used only for /proc/cgroups */ + atomic_t nr_cgrps; /* A list running through the active hierarchies */ struct list_head root_list; @@ -372,10 +336,9 @@ struct css_set { struct list_head cgrp_links; /* - * Set of subsystem states, one for each subsystem. This array - * is immutable after creation apart from the init_css_set - * during subsystem registration (at boot time) and modular subsystem - * loading/unloading. + * Set of subsystem states, one for each subsystem. This array is + * immutable after creation apart from the init_css_set during + * subsystem registration (at boot time). 
*/ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -416,8 +379,9 @@ struct cftype { umode_t mode; /* - * If non-zero, defines the maximum length of string that can - * be passed to write_string; defaults to 64 + * The maximum length of string, excluding trailing nul, that can + * be passed to write_string. If < PAGE_SIZE-1, PAGE_SIZE-1 is + * assumed. */ size_t max_write_len; @@ -425,10 +389,12 @@ struct cftype { unsigned int flags; /* - * The subsys this file belongs to. Initialized automatically - * during registration. NULL for cgroup core files. + * Fields used for internal bookkeeping. Initialized automatically + * during registration. */ - struct cgroup_subsys *ss; + struct cgroup_subsys *ss; /* NULL for cgroup core files */ + struct list_head node; /* anchored at ss->cfts */ + struct kernfs_ops *kf_ops; /* * read_u64() is a shortcut for the common case of returning a @@ -475,36 +441,10 @@ struct cftype { * kick type for multiplexing. */ int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); -}; - -/* - * cftype_sets describe cftypes belonging to a subsystem and are chained at - * cgroup_subsys->cftsets. Each cftset points to an array of cftypes - * terminated by zero length name. - */ -struct cftype_set { - struct list_head node; /* chained at subsys->cftsets */ - struct cftype *cfts; -}; - -/* - * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. Don't - * access directly. - */ -struct cfent { - struct list_head node; - struct dentry *dentry; - struct cftype *type; - struct cgroup_subsys_state *css; - - /* file xattrs */ - struct simple_xattrs xattrs; -}; -/* seq_file->private points to the following, only ->priv is public */ -struct cgroup_open_file { - struct cfent *cfe; - void *priv; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lock_class_key lockdep_key; +#endif }; /* @@ -516,34 +456,79 @@ static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; } -/* Caller should hold rcu_read_lock() */ -static inline const char *cgroup_name(const struct cgroup *cgrp) +/* no synchronization, the result can only be used as a hint */ +static inline bool cgroup_has_tasks(struct cgroup *cgrp) { - return rcu_dereference(cgrp->name)->name; + return !list_empty(&cgrp->cset_links); } -static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) +/* returns ino associated with a cgroup, 0 indicates unmounted root */ +static inline ino_t cgroup_ino(struct cgroup *cgrp) { - struct cgroup_open_file *of = seq->private; - return of->cfe->css; + if (cgrp->kn) + return cgrp->kn->ino; + else + return 0; } static inline struct cftype *seq_cft(struct seq_file *seq) { - struct cgroup_open_file *of = seq->private; - return of->cfe->type; + struct kernfs_open_file *of = seq->private; + + return of->kn->priv; +} + +struct cgroup_subsys_state *seq_css(struct seq_file *seq); + +/* + * Name / path handling functions. All are thin wrappers around the kernfs + * counterparts and can be called under any context. + */ + +static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + return kernfs_name(cgrp->kn, buf, buflen); + else + return strlcpy(buf, "/", buflen); } +static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf, + size_t buflen) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + return kernfs_path(cgrp->kn, buf, buflen); + strlcpy(buf, "/", buflen); + return (buflen <= 2) ? 
NULL : buf; +} + +static inline void pr_cont_cgroup_name(struct cgroup *cgrp) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + pr_cont_kernfs_name(cgrp->kn); + else + pr_cont("/"); +} + +static inline void pr_cont_cgroup_path(struct cgroup *cgrp) +{ + /* dummy_top doesn't have a kn associated */ + if (cgrp->kn) + pr_cont_kernfs_path(cgrp->kn); + else + pr_cont("/"); +} + +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); - -int cgroup_task_count(const struct cgroup *cgrp); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. @@ -551,22 +536,15 @@ int cgroup_task_count(const struct cgroup *cgrp); struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id); -int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor - * @skip_css: skip if task's css matches this, %NULL to iterate through all * @tset: taskset to iterate */ -#define cgroup_taskset_for_each(task, skip_css, tset) \ +#define cgroup_taskset_for_each(task, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ - (task) = cgroup_taskset_next((tset))) \ - if (!(skip_css) || \ - cgroup_taskset_cur_css((tset), \ - (skip_css)->ss->subsys_id) != (skip_css)) + (task) = cgroup_taskset_next((tset))) /* * Control Group subsystem type. @@ -591,7 +569,6 @@ struct cgroup_subsys { struct task_struct *task); void (*bind)(struct cgroup_subsys_state *root_css); - int subsys_id; int disabled; int early_init; @@ -610,27 +587,26 @@ struct cgroup_subsys { bool broken_hierarchy; bool warned_broken_hierarchy; + /* the following two fields are initialized automatically during boot */ + int id; #define MAX_CGROUP_TYPE_NAMELEN 32 const char *name; /* link to parent, protected by cgroup_lock() */ struct cgroupfs_root *root; - /* list of cftype_sets */ - struct list_head cftsets; + /* + * List of cftypes. Each entry is the first entry of an array + * terminated by zero length name. 
+ */ + struct list_head cfts; - /* base cftypes, automatically [de]registered with subsys itself */ + /* base cftypes, automatically registered with subsys itself */ struct cftype *base_cftypes; - struct cftype_set base_cftset; - - /* should be defined only by modular subsystems */ - struct module *module; }; -#define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) +#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> -#undef IS_SUBSYS_ENABLED #undef SUBSYS /** @@ -837,16 +813,11 @@ void css_task_iter_start(struct cgroup_subsys_state *css, struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap); - int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss); +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 7b99d717411..768fe44e19f 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -3,51 +3,51 @@ * * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ -#if IS_SUBSYS_ENABLED(CONFIG_CPUSETS) +#if IS_ENABLED(CONFIG_CPUSETS) SUBSYS(cpuset) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEBUG) +#if IS_ENABLED(CONFIG_CGROUP_DEBUG) SUBSYS(debug) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_SCHED) -SUBSYS(cpu_cgroup) +#if IS_ENABLED(CONFIG_CGROUP_SCHED) +SUBSYS(cpu) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_CPUACCT) +#if IS_ENABLED(CONFIG_CGROUP_CPUACCT) SUBSYS(cpuacct) #endif -#if IS_SUBSYS_ENABLED(CONFIG_MEMCG) -SUBSYS(mem_cgroup) +#if IS_ENABLED(CONFIG_MEMCG) +SUBSYS(memory) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_DEVICE) +#if IS_ENABLED(CONFIG_CGROUP_DEVICE) SUBSYS(devices) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_FREEZER) +#if IS_ENABLED(CONFIG_CGROUP_FREEZER) SUBSYS(freezer) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_CLASSID) +#if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) SUBSYS(net_cls) #endif -#if IS_SUBSYS_ENABLED(CONFIG_BLK_CGROUP) +#if IS_ENABLED(CONFIG_BLK_CGROUP) SUBSYS(blkio) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_PERF) -SUBSYS(perf) +#if IS_ENABLED(CONFIG_CGROUP_PERF) +SUBSYS(perf_event) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_NET_PRIO) +#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) SUBSYS(net_prio) #endif -#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_HUGETLB) +#if IS_ENABLED(CONFIG_CGROUP_HUGETLB) SUBSYS(hugetlb) #endif /* diff --git a/include/linux/device.h b/include/linux/device.h index 952b01033c3..fb1ba13f766 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -560,16 +560,12 @@ extern int device_create_file(struct device *device, const struct device_attribute *entry); extern void device_remove_file(struct device *dev, const struct device_attribute *attr); +extern bool device_remove_file_self(struct device *dev, + const struct device_attribute *attr); extern int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); extern void device_remove_bin_file(struct device *dev, const struct bin_attribute 
*attr); -extern int device_schedule_callback_owner(struct device *dev, - void (*func)(struct device *dev), struct module *owner); - -/* This is a macro to avoid include problems with THIS_MODULE */ -#define device_schedule_callback(dev, func) \ - device_schedule_callback_owner(dev, func, THIS_MODULE) /* device resource management */ typedef void (*dr_release_t)(struct device *dev, void *res); @@ -929,10 +925,7 @@ extern int device_online(struct device *dev); extern struct device *__root_device_register(const char *name, struct module *owner); -/* - * This is a macro to avoid include problems with THIS_MODULE, - * just as per what is done for device_schedule_callback() above. - */ +/* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 787bba3bf55..0129f89cf98 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -49,7 +49,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) static inline bool hugetlb_cgroup_disabled(void) { - if (hugetlb_subsys.disabled) + if (hugetlb_cgrp_subsys.disabled) return true; return false; } diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 5be9f0228a3..649497a56a9 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -15,7 +15,7 @@ #include <linux/lockdep.h> #include <linux/rbtree.h> #include <linux/atomic.h> -#include <linux/completion.h> +#include <linux/wait.h> struct file; struct dentry; @@ -35,16 +35,22 @@ enum kernfs_node_type { }; #define KERNFS_TYPE_MASK 0x000f -#define KERNFS_ACTIVE_REF KERNFS_FILE #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK enum kernfs_node_flag { - KERNFS_REMOVED = 0x0010, + KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_STATIC_NAME = 0x0200, + KERNFS_SUICIDAL = 0x0400, + KERNFS_SUICIDED = 0x0800, +}; + +/* @flags for kernfs_create_root() */ +enum kernfs_root_flag { + KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, }; /* type-specific structures for kernfs_node union members */ @@ -85,17 +91,17 @@ struct kernfs_node { #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif - /* the following two fields are published */ + /* + * Use kernfs_get_parent() and kernfs_name/path() instead of + * accessing the following two fields directly. If the node is + * never moved to a different parent, it is safe to access the + * parent directly. + */ struct kernfs_node *parent; const char *name; struct rb_node rb; - union { - struct completion *completion; - struct kernfs_node *removed_list; - } u; - const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ union { @@ -113,12 +119,16 @@ struct kernfs_node { }; /* - * kernfs_dir_ops may be specified on kernfs_create_root() to support - * directory manipulation syscalls. These optional callbacks are invoked - * on the matching syscalls and can perform any kernfs operations which - * don't necessarily have to be the exact operation requested. + * kernfs_syscall_ops may be specified on kernfs_create_root() to support + * syscalls. These optional callbacks are invoked on the matching syscalls + * and can perform any kernfs operations which don't necessarily have to be + * the exact operation requested. An active reference is held for each + * kernfs_node parameter. 
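With kernfs_create_root() now taking the syscall ops plus a flags argument, a user that wants its whole tree to appear atomically can create the root with KERNFS_ROOT_CREATE_DEACTIVATED, populate it while the nodes are still invisible to userland, and only then call kernfs_activate(). The following is a minimal sketch under that assumption; example_mkdir, example_syscall_ops and example_init() are hypothetical and only demonstrate the wiring:

#include <linux/err.h>
#include <linux/kernfs.h>

static int example_mkdir(struct kernfs_node *parent, const char *name,
			 umode_t mode)
{
	return -EPERM;		/* no mkdir(2) support in this sketch */
}

static struct kernfs_syscall_ops example_syscall_ops = {
	.mkdir	= example_mkdir,
};

static struct kernfs_root *example_root;

static int __init example_init(void)
{
	example_root = kernfs_create_root(&example_syscall_ops,
					  KERNFS_ROOT_CREATE_DEACTIVATED, NULL);
	if (IS_ERR(example_root))
		return PTR_ERR(example_root);

	/* create nodes here; nothing is user-visible yet */

	kernfs_activate(example_root->kn);
	return 0;
}
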
*/ -struct kernfs_dir_ops { +struct kernfs_syscall_ops { + int (*remount_fs)(struct kernfs_root *root, int *flags, char *data); + int (*show_options)(struct seq_file *sf, struct kernfs_root *root); + int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); @@ -129,16 +139,19 @@ struct kernfs_dir_ops { struct kernfs_root { /* published fields */ struct kernfs_node *kn; + unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct ida ino_ida; - struct kernfs_dir_ops *dir_ops; + struct kernfs_syscall_ops *syscall_ops; + wait_queue_head_t deactivate_waitq; }; struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; + void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; @@ -171,9 +184,13 @@ struct kernfs_ops { loff_t off); /* - * write() is bounced through kernel buffer and a write larger than - * PAGE_SIZE results in partial operation of PAGE_SIZE. + * write() is bounced through kernel buffer. If atomic_write_len + * is not set, a write larger than PAGE_SIZE results in partial + * operations of PAGE_SIZE chunks. If atomic_write_len is set, + * writes upto the specified size are executed atomically but + * larger ones are rejected with -E2BIG. */ + size_t atomic_write_len; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); @@ -184,7 +201,7 @@ struct kernfs_ops { #endif }; -#ifdef CONFIG_SYSFS +#ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { @@ -217,13 +234,22 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) return kn->flags & KERNFS_NS; } +int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); +char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, + size_t buflen); +void pr_cont_kernfs_name(struct kernfs_node *kn); +void pr_cont_kernfs_path(struct kernfs_node *kn); +struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); -struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, - void *priv); +struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); +struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); + +struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, + unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, @@ -239,7 +265,11 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); +void kernfs_activate(struct kernfs_node *kn); void kernfs_remove(struct kernfs_node *kn); +void kernfs_break_active_protection(struct kernfs_node *kn); +void kernfs_unbreak_active_protection(struct kernfs_node *kn); +bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, @@ -254,7 +284,7 @@ void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); -#else /* CONFIG_SYSFS */ +#else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } 
/* whatever */ @@ -264,6 +294,19 @@ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } +static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) +{ return -ENOSYS; } + +static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf, + size_t buflen) +{ return NULL; } + +static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } +static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } + +static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) +{ return NULL; } + static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) @@ -272,8 +315,15 @@ kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } +static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) +{ return NULL; } + +static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) +{ return NULL; } + static inline struct kernfs_root * -kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv) +kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, + void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } @@ -295,8 +345,13 @@ kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } +static inline void kernfs_activate(struct kernfs_node *kn) { } + static inline void kernfs_remove(struct kernfs_node *kn) { } +static inline bool kernfs_remove_self(struct kernfs_node *kn) +{ return false; } + static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } @@ -324,7 +379,7 @@ static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } -#endif /* CONFIG_SYSFS */ +#endif /* CONFIG_KERNFS */ static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) @@ -366,6 +421,13 @@ static inline int kernfs_remove_by_name(struct kernfs_node *parent, return kernfs_remove_by_name_ns(parent, name, NULL); } +static inline int kernfs_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, + const char *new_name) +{ + return kernfs_rename_ns(kn, new_parent, new_name, NULL); +} + static inline struct dentry * kernfs_mount(struct file_system_type *fs_type, int flags, struct kernfs_root *root) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index abd0113b662..eccfb4a4b37 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -162,7 +162,7 @@ extern int do_swap_account; static inline bool mem_cgroup_disabled(void) { - if (mem_cgroup_subsys.disabled) + if (memory_cgrp_subsys.disabled) return true; return false; } diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 30b2ebee643..fdaa0c6fc7a 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -178,9 +178,6 @@ struct sysfs_ops { #ifdef CONFIG_SYSFS -int sysfs_schedule_callback(struct kobject *kobj, void (*func)(void *), - void *data, struct module *owner); - int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, @@ -198,6 +195,7 @@ int 
__must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); +bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, @@ -246,14 +244,13 @@ void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); -#else /* CONFIG_SYSFS */ - -static inline int sysfs_schedule_callback(struct kobject *kobj, - void (*func)(void *), void *data, struct module *owner) +static inline void sysfs_enable_ns(struct kernfs_node *kn) { - return -ENOSYS; + return kernfs_enable_ns(kn); } +#else /* CONFIG_SYSFS */ + static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; @@ -301,6 +298,12 @@ static inline void sysfs_remove_file_ns(struct kobject *kobj, { } +static inline bool sysfs_remove_file_self(struct kobject *kobj, + const struct attribute *attr) +{ + return false; +} + static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute **attr) { @@ -418,6 +421,10 @@ static inline int __must_check sysfs_init(void) return 0; } +static inline void sysfs_enable_ns(struct kernfs_node *kn) +{ +} + #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 9cf2d5ef38d..c15d39456e1 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -34,7 +34,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - classid = container_of(task_css(p, net_cls_subsys_id), + classid = container_of(task_css(p, net_cls_cgrp_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index dafc09f0fdb..f2a9597ff53 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -27,32 +27,17 @@ struct netprio_map { void sock_update_netprioidx(struct sock *sk); -#if IS_BUILTIN(CONFIG_CGROUP_NET_PRIO) static inline u32 task_netprioidx(struct task_struct *p) { struct cgroup_subsys_state *css; u32 idx; rcu_read_lock(); - css = task_css(p, net_prio_subsys_id); + css = task_css(p, net_prio_cgrp_id); idx = css->cgroup->id; rcu_read_unlock(); return idx; } -#elif IS_MODULE(CONFIG_CGROUP_NET_PRIO) -static inline u32 task_netprioidx(struct task_struct *p) -{ - struct cgroup_subsys_state *css; - u32 idx = 0; - - rcu_read_lock(); - css = task_css(p, net_prio_subsys_id); - if (css) - idx = css->cgroup->id; - rcu_read_unlock(); - return idx; -} -#endif #else /* !CONFIG_CGROUP_NET_PRIO */ static inline u32 task_netprioidx(struct task_struct *p) { diff --git a/init/Kconfig b/init/Kconfig index 009a797dd24..3f74784560a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -854,6 +854,7 @@ config NUMA_BALANCING menuconfig CGROUPS boolean "Control Group support" + select KERNFS help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 105f273b6f8..8ab800c7bac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -40,23 +40,21 @@ #include <linux/proc_fs.h> #include <linux/rcupdate.h> #include <linux/sched.h> -#include <linux/backing-dev.h> #include <linux/slab.h> -#include <linux/magic.h> #include 
<linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/string.h> #include <linux/sort.h> #include <linux/kmod.h> -#include <linux/module.h> #include <linux/delayacct.h> #include <linux/cgroupstats.h> #include <linux/hashtable.h> -#include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/idr.h> #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> +#include <linux/delay.h> #include <linux/atomic.h> @@ -68,21 +66,21 @@ */ #define CGROUP_PIDLIST_DESTROY_DELAY HZ +#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ + MAX_CFTYPE_NAME + 2) + +/* + * cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file + * creation/removal and hierarchy changing operations including cgroup + * creation, removal, css association and controller rebinding. This outer + * lock is needed mainly to resolve the circular dependency between kernfs + * active ref and cgroup_mutex. cgroup_tree_mutex nests above both. + */ +static DEFINE_MUTEX(cgroup_tree_mutex); + /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. - * - * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify - * cgroupfs_root of any cgroup hierarchy - subsys list, flags, - * release_agent_path and so on. Modifying requires both cgroup_mutex and - * cgroup_root_mutex. Readers can acquire either of the two. This is to - * break the following locking order cycle. - * - * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem - * B. namespace_sem -> cgroup_mutex - * - * B happens only through cgroup_show_options() and using cgroup_root_mutex - * breaks it. */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); @@ -91,20 +89,17 @@ EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ static DEFINE_MUTEX(cgroup_mutex); #endif -static DEFINE_MUTEX(cgroup_root_mutex); +/* + * Protects cgroup_subsys->release_agent_path. Modifying it also requires + * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. + */ +static DEFINE_SPINLOCK(release_agent_path_lock); -#define cgroup_assert_mutex_or_rcu_locked() \ +#define cgroup_assert_mutexes_or_rcu_locked() \ rcu_lockdep_assert(rcu_read_lock_held() || \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex), \ - "cgroup_mutex or RCU read lock required"); - -#ifdef CONFIG_LOCKDEP -#define cgroup_assert_mutex_or_root_locked() \ - WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&cgroup_mutex) && \ - !lockdep_is_held(&cgroup_root_mutex))) -#else -#define cgroup_assert_mutex_or_root_locked() do { } while (0) -#endif + "cgroup_[tree_]mutex or RCU read lock required"); /* * cgroup destruction makes heavy use of work items and there can be a lot @@ -120,17 +115,19 @@ static struct workqueue_struct *cgroup_destroy_wq; */ static struct workqueue_struct *cgroup_pidlist_destroy_wq; -/* - * Generate an array of cgroup subsystem pointers. At boot time, this is - * populated with the built in subsystems, and modular subsystems are - * registered after that. The mutable section of this array is protected by - * cgroup_mutex. 
- */ -#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, -#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) -static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = { +/* generate an array of cgroup subsystem pointers */ +#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys, +static struct cgroup_subsys *cgroup_subsys[] = { +#include <linux/cgroup_subsys.h> +}; +#undef SUBSYS + +/* array of cgroup subsystem names */ +#define SUBSYS(_x) [_x ## _cgrp_id] = #_x, +static const char *cgroup_subsys_name[] = { #include <linux/cgroup_subsys.h> }; +#undef SUBSYS /* * The dummy hierarchy, reserved for the subsystems that are otherwise @@ -147,15 +144,9 @@ static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup; static LIST_HEAD(cgroup_roots); static int cgroup_root_count; -/* - * Hierarchy ID allocation and mapping. It follows the same exclusion - * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for - * writes, either for reads. - */ +/* hierarchy ID allocation and mapping, protected by cgroup_mutex */ static DEFINE_IDR(cgroup_hierarchy_idr); -static struct cgroup_name root_cgroup_name = { .name = "/" }; - /* * Assign a monotonically increasing serial number to cgroups. It * guarantees cgroups with bigger numbers are newer than those with smaller @@ -175,11 +166,13 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; +static void cgroup_put(struct cgroup *cgrp); +static int rebind_subsystems(struct cgroupfs_root *root, + unsigned long added_mask, unsigned removed_mask); static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); -static int cgroup_file_release(struct inode *inode, struct file *file); static void cgroup_pidlist_destroy_all(struct cgroup *cgrp); /** @@ -197,8 +190,9 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, struct cgroup_subsys *ss) { if (ss) - return rcu_dereference_check(cgrp->subsys[ss->subsys_id], - lockdep_is_held(&cgroup_mutex)); + return rcu_dereference_check(cgrp->subsys[ss->id], + lockdep_is_held(&cgroup_tree_mutex) || + lockdep_is_held(&cgroup_mutex)); else return &cgrp->dummy_css; } @@ -209,6 +203,27 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp) return test_bit(CGRP_DEAD, &cgrp->flags); } +struct cgroup_subsys_state *seq_css(struct seq_file *seq) +{ + struct kernfs_open_file *of = seq->private; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = seq_cft(seq); + + /* + * This is open and unprotected implementation of cgroup_css(). + * seq_css() is only called from a kernfs file operation which has + * an active reference on the file. Because all the subsystem + * files are drained before a css is disassociated with a cgroup, + * the matching css from the cgroup's subsys table is guaranteed to + * be and stay valid until the enclosing operation is complete. 
+ */ + if (cft->ss) + return rcu_dereference_raw(cgrp->subsys[cft->ss->id]); + else + return &cgrp->dummy_css; +} +EXPORT_SYMBOL_GPL(seq_css); + /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested @@ -227,7 +242,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } return false; } -EXPORT_SYMBOL_GPL(cgroup_is_descendant); static int cgroup_is_releasable(const struct cgroup *cgrp) { @@ -254,54 +268,23 @@ static int notify_on_release(const struct cgroup *cgrp) for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ if (!((css) = rcu_dereference_check( \ (cgrp)->subsys[(ssid)], \ + lockdep_is_held(&cgroup_tree_mutex) || \ lockdep_is_held(&cgroup_mutex)))) { } \ else /** - * for_each_subsys - iterate all loaded cgroup subsystems + * for_each_subsys - iterate all enabled cgroup subsystems * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * - * Iterates through all loaded subsystems. Should be called under - * cgroup_mutex or cgroup_root_mutex. */ #define for_each_subsys(ss, ssid) \ - for (({ cgroup_assert_mutex_or_root_locked(); (ssid) = 0; }); \ - (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ - if (!((ss) = cgroup_subsys[(ssid)])) { } \ - else - -/** - * for_each_builtin_subsys - iterate all built-in cgroup subsystems - * @ss: the iteration cursor - * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end - * - * Bulit-in subsystems are always present and iteration itself doesn't - * require any synchronization. - */ -#define for_each_builtin_subsys(ss, i) \ - for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT && \ - (((ss) = cgroup_subsys[i]) || true); (i)++) + for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \ + (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /* iterate across the active hierarchies */ #define for_each_active_root(root) \ list_for_each_entry((root), &cgroup_roots, root_list) -static inline struct cgroup *__d_cgrp(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cfent *__d_cfe(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -static inline struct cftype *__d_cft(struct dentry *dentry) -{ - return __d_cfe(dentry)->type; -} - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -358,11 +341,10 @@ static struct css_set init_css_set; static struct cgrp_cset_link init_cgrp_cset_link; /* - * css_set_lock protects the list of css_set objects, and the chain of - * tasks off each css_set. Nests outside task->alloc_lock due to - * css_task_iter_start(). + * css_set_rwsem protects the list of css_set objects, and the chain of + * tasks off each css_set. */ -static DEFINE_RWLOCK(css_set_lock); +static DECLARE_RWSEM(css_set_rwsem); static int css_set_count; /* @@ -386,30 +368,14 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* - * We don't maintain the lists running through each css_set to its task - * until after the first call to css_task_iter_start(). This reduces the - * fork()/exit() overhead for people who have cgroups compiled into their - * kernel but not actually in use. - */ -static int use_task_css_set_links __read_mostly; - -static void __put_css_set(struct css_set *cset, int taskexit) +static void put_css_set_locked(struct css_set *cset, bool taskexit) { struct cgrp_cset_link *link, *tmp_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. 
Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cset->refcount, -1, 1)) - return; - write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) { - write_unlock(&css_set_lock); + lockdep_assert_held(&css_set_rwsem); + + if (!atomic_dec_and_test(&cset->refcount)) return; - } /* This css_set is dead. unlink it and release cgroup refcounts */ hash_del(&cset->hlist); @@ -421,7 +387,7 @@ static void __put_css_set(struct css_set *cset, int taskexit) list_del(&link->cset_link); list_del(&link->cgrp_link); - /* @cgrp can't go away while we're holding css_set_lock */ + /* @cgrp can't go away while we're holding css_set_rwsem */ if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); @@ -431,10 +397,24 @@ static void __put_css_set(struct css_set *cset, int taskexit) kfree(link); } - write_unlock(&css_set_lock); kfree_rcu(cset, rcu_head); } +static void put_css_set(struct css_set *cset, bool taskexit) +{ + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cset->refcount, -1, 1)) + return; + + down_write(&css_set_rwsem); + put_css_set_locked(cset, taskexit); + up_write(&css_set_rwsem); +} + /* * refcounted get/put for css_set objects */ @@ -443,16 +423,6 @@ static inline void get_css_set(struct css_set *cset) atomic_inc(&cset->refcount); } -static inline void put_css_set(struct css_set *cset) -{ - __put_css_set(cset, 0); -} - -static inline void put_css_set_taskexit(struct css_set *cset) -{ - __put_css_set(cset, 1); -} - /** * compare_css_sets - helper function for find_existing_css_set(). * @cset: candidate css_set being tested @@ -652,11 +622,11 @@ static struct css_set *find_css_set(struct css_set *old_cset, /* First see if we already have a cgroup group that matches * the desired set */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); cset = find_existing_css_set(old_cset, cgrp, template); if (cset) get_css_set(cset); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (cset) return cset; @@ -680,7 +650,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, * find_existing_css_set() */ memcpy(cset->subsys, template, sizeof(cset->subsys)); - write_lock(&css_set_lock); + down_write(&css_set_rwsem); /* Add reference counts and links from the new css_set. 
*/ list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -698,14 +668,98 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); return cset; } +static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root) +{ + struct cgroup *top_cgrp = kf_root->kn->priv; + + return top_cgrp->root; +} + +static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) +{ + int id; + + lockdep_assert_held(&cgroup_mutex); + + id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, + GFP_KERNEL); + if (id < 0) + return id; + + root->hierarchy_id = id; + return 0; +} + +static void cgroup_exit_root_id(struct cgroupfs_root *root) +{ + lockdep_assert_held(&cgroup_mutex); + + if (root->hierarchy_id) { + idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); + root->hierarchy_id = 0; + } +} + +static void cgroup_free_root(struct cgroupfs_root *root) +{ + if (root) { + /* hierarchy ID should already have been released */ + WARN_ON_ONCE(root->hierarchy_id); + + idr_destroy(&root->cgroup_idr); + kfree(root); + } +} + +static void cgroup_destroy_root(struct cgroupfs_root *root) +{ + struct cgroup *cgrp = &root->top_cgroup; + struct cgrp_cset_link *link, *tmp_link; + + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); + + BUG_ON(atomic_read(&root->nr_cgrps)); + BUG_ON(!list_empty(&cgrp->children)); + + /* Rebind all subsystems back to the default hierarchy */ + WARN_ON(rebind_subsystems(root, 0, root->subsys_mask)); + + /* + * Release all the links from cset_links to this hierarchy's + * root cgroup + */ + down_write(&css_set_rwsem); + + list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { + list_del(&link->cset_link); + list_del(&link->cgrp_link); + kfree(link); + } + up_write(&css_set_rwsem); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + cgroup_root_count--; + } + + cgroup_exit_root_id(root); + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + + kernfs_destroy_root(root->kf_root); + cgroup_free_root(root); +} + /* * Return the cgroup for "task" from the given hierarchy. Must be - * called with cgroup_mutex held. + * called with cgroup_mutex and css_set_rwsem held. */ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct cgroupfs_root *root) @@ -713,8 +767,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, struct css_set *cset; struct cgroup *res = NULL; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - read_lock(&css_set_lock); + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * No need to lock the task - since we hold cgroup_mutex the * task can't change groups, so the only thing that can happen @@ -735,7 +790,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, } } } - read_unlock(&css_set_lock); + BUG_ON(!res); return res; } @@ -790,78 +845,71 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/* - * A couple of forward declarations required, due to cyclic reference loop: - * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> - * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations - * -> cgroup_mkdir. 
- */ - -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); -static const struct inode_operations cgroup_dir_inode_operations; +static struct kernfs_syscall_ops cgroup_kf_syscall_ops; static const struct file_operations proc_cgroupstats_operations; -static struct backing_dev_info cgroup_backing_dev_info = { - .name = "cgroup", - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, -}; - -static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) +static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, + char *buf) { - struct inode *inode = new_inode(sb); - - if (inode) { - inode->i_ino = get_next_ino(); - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; - } - return inode; + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) + snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", + cft->ss->name, cft->name); + else + strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + return buf; } -static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +/** + * cgroup_file_mode - deduce file mode of a control file + * @cft: the control file in question + * + * returns cft->mode if ->mode is not 0 + * returns S_IRUGO|S_IWUSR if it has both a read and a write handler + * returns S_IRUGO if it has only a read handler + * returns S_IWUSR if it has only a write handler + */ +static umode_t cgroup_file_mode(const struct cftype *cft) { - struct cgroup_name *name; + umode_t mode = 0; - name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); - if (!name) - return NULL; - strcpy(name->name, dentry->d_name.name); - return name; + if (cft->mode) + return cft->mode; + + if (cft->read_u64 || cft->read_s64 || cft->seq_show) + mode |= S_IRUGO; + + if (cft->write_u64 || cft->write_s64 || cft->write_string || + cft->trigger) + mode |= S_IWUSR; + + return mode; } static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - mutex_lock(&cgroup_mutex); - cgrp->root->number_of_cgroups--; - mutex_unlock(&cgroup_mutex); - - /* - * We get a ref to the parent's dentry, and put the ref when - * this cgroup is being freed, so it's guaranteed that the - * parent won't be destroyed before its children. - */ - dput(cgrp->parent->dentry); - - /* - * Drop the active superblock reference that we took when we - * created the cgroup. This will free cgrp->root, if we are - * holding the last reference to @sb. - */ - deactivate_super(cgrp->root->sb); - + atomic_dec(&cgrp->root->nr_cgrps); cgroup_pidlist_destroy_all(cgrp); - simple_xattrs_free(&cgrp->xattrs); - - kfree(rcu_dereference_raw(cgrp->name)); - kfree(cgrp); + if (cgrp->parent) { + /* + * We get a ref to the parent, and put the ref when this + * cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + cgroup_put(cgrp->parent); + kernfs_put(cgrp->kn); + kfree(cgrp); + } else { + /* + * This is top cgroup's refcnt reaching zero, which + * indicates that the root should be released. 
+ */ + cgroup_destroy_root(cgrp->root); + } } static void cgroup_free_rcu(struct rcu_head *head) @@ -872,73 +920,40 @@ static void cgroup_free_rcu(struct rcu_head *head) queue_work(cgroup_destroy_wq, &cgrp->destroy_work); } -static void cgroup_diput(struct dentry *dentry, struct inode *inode) -{ - /* is dentry a directory ? if so, kfree() associated cgroup */ - if (S_ISDIR(inode->i_mode)) { - struct cgroup *cgrp = dentry->d_fsdata; - - BUG_ON(!(cgroup_is_dead(cgrp))); - - /* - * XXX: cgrp->id is only used to look up css's. As cgroup - * and css's lifetimes will be decoupled, it should be made - * per-subsystem and moved to css->id so that lookups are - * successful until the target css is released. - */ - mutex_lock(&cgroup_mutex); - idr_remove(&cgrp->root->cgroup_idr, cgrp->id); - mutex_unlock(&cgroup_mutex); - cgrp->id = -1; - - call_rcu(&cgrp->rcu_head, cgroup_free_rcu); - } else { - struct cfent *cfe = __d_cfe(dentry); - struct cgroup *cgrp = dentry->d_parent->d_fsdata; - - WARN_ONCE(!list_empty(&cfe->node) && - cgrp != &cgrp->root->top_cgroup, - "cfe still linked for %s\n", cfe->type->name); - simple_xattrs_free(&cfe->xattrs); - kfree(cfe); - } - iput(inode); -} - -static void remove_dir(struct dentry *d) +static void cgroup_get(struct cgroup *cgrp) { - struct dentry *parent = dget(d->d_parent); - - d_delete(d); - simple_rmdir(parent->d_inode, d); - dput(parent); + WARN_ON_ONCE(cgroup_is_dead(cgrp)); + WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0); + atomic_inc(&cgrp->refcnt); } -static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +static void cgroup_put(struct cgroup *cgrp) { - struct cfent *cfe; - - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + if (!atomic_dec_and_test(&cgrp->refcnt)) + return; + if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp))) + return; /* - * If we're doing cleanup due to failure of cgroup_create(), - * the corresponding @cfe may not exist. + * XXX: cgrp->id is only used to look up css's. As cgroup and + * css's lifetimes will be decoupled, it should be made + * per-subsystem and moved to css->id so that lookups are + * successful until the target css is released. 
*/ - list_for_each_entry(cfe, &cgrp->files, node) { - struct dentry *d = cfe->dentry; + mutex_lock(&cgroup_mutex); + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + mutex_unlock(&cgroup_mutex); + cgrp->id = -1; - if (cft && cfe->type != cft) - continue; + call_rcu(&cgrp->rcu_head, cgroup_free_rcu); +} - dget(d); - d_delete(d); - simple_unlink(cgrp->dentry->d_inode, d); - list_del_init(&cfe->node); - dput(d); +static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) +{ + char name[CGROUP_FILE_NAME_MAX]; - break; - } + lockdep_assert_held(&cgroup_tree_mutex); + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** @@ -952,81 +967,41 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) int i; for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, set->cfts, false); + list_for_each_entry(cfts, &ss->cfts, node) + cgroup_addrm_files(cgrp, cfts, false); } } -/* - * NOTE : the dentry must have been dget()'ed - */ -static void cgroup_d_remove_dir(struct dentry *dentry) -{ - struct dentry *parent; - - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - list_del_init(&dentry->d_u.d_child); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - remove_dir(dentry); -} - -/* - * Call with cgroup_mutex held. Drops reference counts on modules, including - * any duplicate ones that parse_cgroupfs_options took. If this function - * returns an error, no reference counts are touched. - */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long added_mask, unsigned removed_mask) { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - unsigned long pinned = 0; int i, ret; - BUG_ON(!mutex_is_locked(&cgroup_mutex)); - BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); /* Check that any added subsystems are currently free */ - for_each_subsys(ss, i) { - if (!(added_mask & (1 << i))) - continue; - - /* is the subsystem mounted elsewhere? */ - if (ss->root != &cgroup_dummy_root) { - ret = -EBUSY; - goto out_put; - } - - /* pin the module */ - if (!try_module_get(ss->module)) { - ret = -ENOENT; - goto out_put; - } - pinned |= 1 << i; - } - - /* subsys could be missing if unloaded between parsing and here */ - if (added_mask != pinned) { - ret = -ENOENT; - goto out_put; - } + for_each_subsys(ss, i) + if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root) + return -EBUSY; ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - goto out_put; + return ret; /* * Nothing can fail from this point on. Remove files for the * removed subsystems and rebind each subsystem. */ + mutex_unlock(&cgroup_mutex); cgroup_clear_dir(cgrp, removed_mask); + mutex_lock(&cgroup_mutex); for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -1059,35 +1034,21 @@ static int rebind_subsystems(struct cgroupfs_root *root, RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; - - /* subsystem is now free - drop reference on module */ - module_put(ss->module); root->subsys_mask &= ~bit; } } - /* - * Mark @root has finished binding subsystems. @root->subsys_mask - * now matches the bound subsystems. 
- */ - root->flags |= CGRP_ROOT_SUBSYS_BOUND; - + kernfs_activate(cgrp->kn); return 0; - -out_put: - for_each_subsys(ss, i) - if (pinned & (1 << i)) - module_put(ss->module); - return ret; } -static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) +static int cgroup_show_options(struct seq_file *seq, + struct kernfs_root *kf_root) { - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_subsys *ss; int ssid; - mutex_lock(&cgroup_root_mutex); for_each_subsys(ss, ssid) if (root->subsys_mask & (1 << ssid)) seq_printf(seq, ",%s", ss->name); @@ -1097,13 +1058,16 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); + + spin_lock(&release_agent_path_lock); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + spin_unlock(&release_agent_path_lock); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); - mutex_unlock(&cgroup_root_mutex); return 0; } @@ -1115,9 +1079,6 @@ struct cgroup_sb_opts { char *name; /* User explicitly requested empty subsystem */ bool none; - - struct cgroupfs_root *new_root; - }; /* @@ -1137,7 +1098,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) BUG_ON(!mutex_is_locked(&cgroup_mutex)); #ifdef CONFIG_CPUSETS - mask = ~(1UL << cpuset_subsys_id); + mask = ~(1UL << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1242,13 +1203,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (opts->flags & CGRP_ROOT_NOPREFIX) { - pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); - return -EINVAL; - } - - if (opts->cpuset_clone_children) { - pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || + opts->cpuset_clone_children || opts->release_agent || + opts->name) { + pr_err("cgroup: sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); return -EINVAL; } } @@ -1276,11 +1234,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return 0; } -static int cgroup_remount(struct super_block *sb, int *flags, char *data) +static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) { int ret = 0; - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; @@ -1289,9 +1246,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) return -EINVAL; } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1316,7 +1272,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) } /* remounting is not allowed for populated hierarchies */ - if (root->number_of_cgroups > 1) { + if (!list_empty(&root->top_cgroup.children)) { ret = -EBUSY; goto out_unlock; } @@ -1325,35 
+1281,81 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - if (opts.release_agent) + if (opts.release_agent) { + spin_lock(&release_agent_path_lock); strcpy(root->release_agent_path, opts.release_agent); + spin_unlock(&release_agent_path_lock); + } out_unlock: kfree(opts.release_agent); kfree(opts.name); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return ret; } -static const struct super_operations cgroup_ops = { - .statfs = simple_statfs, - .drop_inode = generic_delete_inode, - .show_options = cgroup_show_options, - .remount_fs = cgroup_remount, -}; +/* + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first mount. + */ +static bool use_task_css_set_links __read_mostly; + +static void cgroup_enable_task_cg_lists(void) +{ + struct task_struct *p, *g; + + down_write(&css_set_rwsem); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + + WARN_ON_ONCE(!list_empty(&p->cg_list) || + task_css_set(p) != &init_css_set); + + /* + * We should check if the process is exiting, otherwise + * it will race with cgroup_exit() in that the list + * entry won't be deleted though the process has exited. + * Do it while holding siglock so that we don't end up + * racing against cgroup_exit(). 
+ */ + spin_lock_irq(&p->sighand->siglock); + if (!(p->flags & PF_EXITING)) + list_add(&p->cg_list, &task_css_set(p)->tasks); + spin_unlock_irq(&p->sighand->siglock); + + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +out_unlock: + up_write(&css_set_rwsem); +} static void init_cgroup_housekeeping(struct cgroup *cgrp) { + atomic_set(&cgrp->refcnt, 1); INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); - INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); cgrp->dummy_css.cgroup = cgrp; - simple_xattrs_init(&cgrp->xattrs); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -1361,66 +1363,18 @@ static void init_cgroup_root(struct cgroupfs_root *root) struct cgroup *cgrp = &root->top_cgroup; INIT_LIST_HEAD(&root->root_list); - root->number_of_cgroups = 1; + atomic_set(&root->nr_cgrps, 1); cgrp->root = root; - RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); idr_init(&root->cgroup_idr); } -static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) -{ - int id; - - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); - - id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end, - GFP_KERNEL); - if (id < 0) - return id; - - root->hierarchy_id = id; - return 0; -} - -static void cgroup_exit_root_id(struct cgroupfs_root *root) -{ - lockdep_assert_held(&cgroup_mutex); - lockdep_assert_held(&cgroup_root_mutex); - - if (root->hierarchy_id) { - idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id); - root->hierarchy_id = 0; - } -} - -static int cgroup_test_super(struct super_block *sb, void *data) -{ - struct cgroup_sb_opts *opts = data; - struct cgroupfs_root *root = sb->s_fs_info; - - /* If we asked for a name then it must match */ - if (opts->name && strcmp(opts->name, root->name)) - return 0; - - /* - * If we asked for subsystems (or explicitly for no - * subsystems) then they must match - */ - if ((opts->subsys_mask || opts->none) - && (opts->subsys_mask != root->subsys_mask)) - return 0; - - return 1; -} - static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) { struct cgroupfs_root *root; if (!opts->subsys_mask && !opts->none) - return NULL; + return ERR_PTR(-EINVAL); root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) @@ -1428,15 +1382,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) init_cgroup_root(root); - /* - * We need to set @root->subsys_mask now so that @root can be - * matched by cgroup_test_super() before it finishes - * initialization; otherwise, competing mounts with the same - * options may try to bind the same subsystems instead of waiting - * for the first one leading to unexpected mount errors. - * SUBSYS_BOUND will be set once actual binding is complete. 
- */ - root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); @@ -1447,291 +1392,202 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) return root; } -static void cgroup_free_root(struct cgroupfs_root *root) +static int cgroup_setup_root(struct cgroupfs_root *root, unsigned long ss_mask) { - if (root) { - /* hierarhcy ID shoulid already have been released */ - WARN_ON_ONCE(root->hierarchy_id); - - idr_destroy(&root->cgroup_idr); - kfree(root); - } -} + LIST_HEAD(tmp_links); + struct cgroup *root_cgrp = &root->top_cgroup; + struct css_set *cset; + int i, ret; -static int cgroup_set_super(struct super_block *sb, void *data) -{ - int ret; - struct cgroup_sb_opts *opts = data; + lockdep_assert_held(&cgroup_tree_mutex); + lockdep_assert_held(&cgroup_mutex); - /* If we don't have a new root, we can't set up a new sb */ - if (!opts->new_root) - return -EINVAL; + ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); + if (ret < 0) + goto out; + root_cgrp->id = ret; - BUG_ON(!opts->subsys_mask && !opts->none); + /* + * We're accessing css_set_count without locking css_set_rwsem here, + * but that's OK - it can only be increased by someone holding + * cgroup_lock, and that's us. The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + if (ret) + goto out; - ret = set_anon_super(sb, NULL); + /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ + ret = cgroup_init_root_id(root, 2, 0); if (ret) - return ret; + goto out; - sb->s_fs_info = opts->new_root; - opts->new_root->sb = sb; + root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + root_cgrp); + if (IS_ERR(root->kf_root)) { + ret = PTR_ERR(root->kf_root); + goto exit_root_id; + } + root_cgrp->kn = root->kf_root->kn; - sb->s_blocksize = PAGE_CACHE_SIZE; - sb->s_blocksize_bits = PAGE_CACHE_SHIFT; - sb->s_magic = CGROUP_SUPER_MAGIC; - sb->s_op = &cgroup_ops; + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); + if (ret) + goto destroy_root; - return 0; -} + ret = rebind_subsystems(root, ss_mask, 0); + if (ret) + goto destroy_root; -static int cgroup_get_rootdir(struct super_block *sb) -{ - static const struct dentry_operations cgroup_dops = { - .d_iput = cgroup_diput, - .d_delete = always_delete_dentry, - }; + /* + * There must be no failure case after here, since rebinding takes + * care of subsystems' refcounts, which are explicitly dropped in + * the failure exit path. + */ + list_add(&root->root_list, &cgroup_roots); + cgroup_root_count++; - struct inode *inode = - cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); + /* + * Link the top cgroup in this hierarchy into all the css_set + * objects. + */ + down_write(&css_set_rwsem); + hash_for_each(css_set_table, i, cset, hlist) + link_css_set(&tmp_links, cset, root_cgrp); + up_write(&css_set_rwsem); - if (!inode) - return -ENOMEM; + BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(atomic_read(&root->nr_cgrps) != 1); - inode->i_fop = &simple_dir_operations; - inode->i_op = &cgroup_dir_inode_operations; - /* directories start off with i_nlink == 2 (for "." 
entry) */ - inc_nlink(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; - /* for everything else we want ->d_op set */ - sb->s_d_op = &cgroup_dops; - return 0; + kernfs_activate(root_cgrp->kn); + ret = 0; + goto out; + +destroy_root: + kernfs_destroy_root(root->kf_root); + root->kf_root = NULL; +exit_root_id: + cgroup_exit_root_id(root); +out: + free_cgrp_cset_links(&tmp_links); + return ret; } static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { - struct cgroup_sb_opts opts; struct cgroupfs_root *root; - int ret = 0; - struct super_block *sb; - struct cgroupfs_root *new_root; - struct list_head tmp_links; - struct inode *inode; - const struct cred *cred; - - /* First find the desired set of subsystems */ - mutex_lock(&cgroup_mutex); - ret = parse_cgroupfs_options(data, &opts); - mutex_unlock(&cgroup_mutex); - if (ret) - goto out_err; + struct cgroup_sb_opts opts; + struct dentry *dentry; + int ret; /* - * Allocate a new cgroup root. We may not need it if we're - * reusing an existing hierarchy. + * The first time anyone tries to mount a cgroup, enable the list + * linking each css_set to its tasks and fix up all existing tasks. */ - new_root = cgroup_root_from_opts(&opts); - if (IS_ERR(new_root)) { - ret = PTR_ERR(new_root); - goto out_err; - } - opts.new_root = new_root; - - /* Locate an existing or new sb for this hierarchy */ - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); - if (IS_ERR(sb)) { - ret = PTR_ERR(sb); - cgroup_free_root(opts.new_root); - goto out_err; - } - - root = sb->s_fs_info; - BUG_ON(!root); - if (root == opts.new_root) { - /* We used the new root structure, so this is a new hierarchy */ - struct cgroup *root_cgrp = &root->top_cgroup; - struct cgroupfs_root *existing_root; - int i; - struct css_set *cset; - - BUG_ON(sb->s_root != NULL); - - ret = cgroup_get_rootdir(sb); - if (ret) - goto drop_new_super; - inode = sb->s_root->d_inode; - - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL); - if (ret < 0) - goto unlock_drop; - root_cgrp->id = ret; - - /* Check for name clashes with existing mounts */ - ret = -EBUSY; - if (strlen(root->name)) - for_each_active_root(existing_root) - if (!strcmp(existing_root->name, root->name)) - goto unlock_drop; - - /* - * We're accessing css_set_count without locking - * css_set_lock here, but that's OK - it can only be - * increased by someone holding cgroup_lock, and - * that's us. The worst that can happen is that we - * have some link structures left over - */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); - if (ret) - goto unlock_drop; - - /* ID 0 is reserved for dummy root, 1 for unified hierarchy */ - ret = cgroup_init_root_id(root, 2, 0); - if (ret) - goto unlock_drop; - - sb->s_root->d_fsdata = root_cgrp; - root_cgrp->dentry = sb->s_root; - - /* - * We're inside get_sb() and will call lookup_one_len() to - * create the root files, which doesn't work if SELinux is - * in use. The following cred dancing somehow works around - * it. See 2ce9738ba ("cgroupfs: use init_cred when - * populating new cgroupfs mount") for more details. 
- */ - cred = override_creds(&init_cred); - - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); - if (ret) - goto rm_base_files; + if (!use_task_css_set_links) + cgroup_enable_task_cg_lists(); +retry: + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); - ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret) - goto rm_base_files; + /* First find the desired set of subsystems */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; - revert_creds(cred); + /* look for a matching existing root */ + for_each_active_root(root) { + bool name_match = false; /* - * There must be no failure case after here, since rebinding - * takes care of subsystems' refcounts, which are explicitly - * dropped in the failure exit path. + * If we asked for a name then it must match. Also, if + * name matches but subsys_mask doesn't, we should fail. + * Remember whether name matched. */ + if (opts.name) { + if (strcmp(opts.name, root->name)) + continue; + name_match = true; + } - list_add(&root->root_list, &cgroup_roots); - cgroup_root_count++; - - /* Link the top cgroup in this hierarchy into all - * the css_set objects */ - write_lock(&css_set_lock); - hash_for_each(css_set_table, i, cset, hlist) - link_css_set(&tmp_links, cset, root_cgrp); - write_unlock(&css_set_lock); - - free_cgrp_cset_links(&tmp_links); - - BUG_ON(!list_empty(&root_cgrp->children)); - BUG_ON(root->number_of_cgroups != 1); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - } else { /* - * We re-used an existing hierarchy - the new root (if - * any) is not needed + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match. */ - cgroup_free_root(opts.new_root); + if ((opts.subsys_mask || opts.none) && + (opts.subsys_mask != root->subsys_mask)) { + if (!name_match) + continue; + ret = -EBUSY; + goto out_unlock; + } if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); ret = -EINVAL; - goto drop_new_super; + goto out_unlock; } else { pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - } - kfree(opts.release_agent); - kfree(opts.name); - return dget(sb->s_root); - - rm_base_files: - free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); - revert_creds(cred); - unlock_drop: - cgroup_exit_root_id(root); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - drop_new_super: - deactivate_locked_super(sb); - out_err: - kfree(opts.release_agent); - kfree(opts.name); - return ERR_PTR(ret); -} - -static void cgroup_kill_sb(struct super_block *sb) -{ - struct cgroupfs_root *root = sb->s_fs_info; - struct cgroup *cgrp = &root->top_cgroup; - struct cgrp_cset_link *link, *tmp_link; - int ret; - - BUG_ON(!root); - - BUG_ON(root->number_of_cgroups != 1); - BUG_ON(!list_empty(&cgrp->children)); + /* + * A root's lifetime is governed by its top cgroup. Zero + * ref indicates that the root is being destroyed. Wait for + * destruction to complete so that the subsystems are free. + * We can use wait_queue for the wait but this path is + * super cold. Let's just sleep for a bit and retry. 
+ */ + if (!atomic_inc_not_zero(&root->top_cgroup.refcnt)) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + kfree(opts.release_agent); + kfree(opts.name); + msleep(10); + goto retry; + } - mutex_lock(&cgrp->dentry->d_inode->i_mutex); - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); + ret = 0; + goto out_unlock; + } - /* Rebind all subsystems back to the default hierarchy */ - if (root->flags & CGRP_ROOT_SUBSYS_BOUND) { - ret = rebind_subsystems(root, 0, root->subsys_mask); - /* Shouldn't be able to fail ... */ - BUG_ON(ret); + /* no such thing, create a new one */ + root = cgroup_root_from_opts(&opts); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto out_unlock; } - /* - * Release all the links from cset_links to this hierarchy's - * root cgroup - */ - write_lock(&css_set_lock); + ret = cgroup_setup_root(root, opts.subsys_mask); + if (ret) + cgroup_free_root(root); - list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) { - list_del(&link->cset_link); - list_del(&link->cgrp_link); - kfree(link); - } - write_unlock(&css_set_lock); +out_unlock: + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); - if (!list_empty(&root->root_list)) { - list_del(&root->root_list); - cgroup_root_count--; - } + kfree(opts.release_agent); + kfree(opts.name); - cgroup_exit_root_id(root); + if (ret) + return ERR_PTR(ret); - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + dentry = kernfs_mount(fs_type, flags, root->kf_root); + if (IS_ERR(dentry)) + cgroup_put(&root->top_cgroup); + return dentry; +} - simple_xattrs_free(&cgrp->xattrs); +static void cgroup_kill_sb(struct super_block *sb) +{ + struct kernfs_root *kf_root = kernfs_root_from_sb(sb); + struct cgroupfs_root *root = cgroup_root_from_kf(kf_root); - kill_litter_super(sb); - cgroup_free_root(root); + cgroup_put(&root->top_cgroup); + kernfs_kill_sb(sb); } static struct file_system_type cgroup_fs_type = { @@ -1743,57 +1599,6 @@ static struct file_system_type cgroup_fs_type = { static struct kobject *cgroup_kobj; /** - * cgroup_path - generate the path of a cgroup - * @cgrp: the cgroup in question - * @buf: the buffer to write the path into - * @buflen: the length of the buffer - * - * Writes path of cgroup into buf. Returns 0 on success, -errno on error. - * - * We can't generate cgroup path using dentry->d_name, as accessing - * dentry->name must be protected by irq-unsafe dentry->d_lock or parent - * inode's i_mutex, while on the other hand cgroup_path() can be called - * with some irq-safe spinlocks held. 
- */ -int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) -{ - int ret = -ENAMETOOLONG; - char *start; - - if (!cgrp->parent) { - if (strlcpy(buf, "/", buflen) >= buflen) - return -ENAMETOOLONG; - return 0; - } - - start = buf + buflen - 1; - *start = '\0'; - - rcu_read_lock(); - do { - const char *name = cgroup_name(cgrp); - int len; - - len = strlen(name); - if ((start -= len) < buf) - goto out; - memcpy(start, name, len); - - if (--start < buf) - goto out; - *start = '/'; - - cgrp = cgrp->parent; - } while (cgrp->parent); - ret = 0; - memmove(buf, start, buf + buflen - start); -out: - rcu_read_unlock(); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_path); - -/** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task * @buf: the buffer to write the path into @@ -1804,31 +1609,32 @@ EXPORT_SYMBOL_GPL(cgroup_path); * function grabs cgroup_mutex and shouldn't be used inside locks used by * cgroup controller callbacks. * - * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short. + * Return value is the same as kernfs_path(). */ -int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) +char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) { struct cgroupfs_root *root; struct cgroup *cgrp; - int hierarchy_id = 1, ret = 0; - - if (buflen < 2) - return -ENAMETOOLONG; + int hierarchy_id = 1; + char *path = NULL; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); if (root) { cgrp = task_cgroup_from_root(task, root); - ret = cgroup_path(cgrp, buf, buflen); + path = cgroup_path(cgrp, buf, buflen); } else { /* if no hierarchy exists, everyone is in "/" */ - memcpy(buf, "/", 2); + if (strlcpy(buf, "/", buflen) < buflen) + path = buf; } + up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); - return ret; + return path; } EXPORT_SYMBOL_GPL(task_cgroup_path); @@ -1846,7 +1652,6 @@ struct cgroup_taskset { struct flex_array *tc_array; int tc_array_len; int idx; - struct cgroup *cur_cgrp; }; /** @@ -1861,11 +1666,9 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) tset->idx = 0; return cgroup_taskset_next(tset); } else { - tset->cur_cgrp = tset->single.cgrp; return tset->single.task; } } -EXPORT_SYMBOL_GPL(cgroup_taskset_first); /** * cgroup_taskset_next - iterate to the next task in taskset @@ -1882,42 +1685,16 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) return NULL; tc = flex_array_get(tset->tc_array, tset->idx++); - tset->cur_cgrp = tc->cgrp; return tc->task; } -EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_css - return the matching css for the current task - * @tset: taskset of interest - * @subsys_id: the ID of the target subsystem - * - * Return the css for the current (last returned) task of @tset for - * subsystem specified by @subsys_id. This function must be preceded by - * either cgroup_taskset_first() or cgroup_taskset_next(). - */ -struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, - int subsys_id) -{ - return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); -} -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); - -/** - * cgroup_taskset_size - return the number of tasks in taskset - * @tset: taskset of interest - */ -int cgroup_taskset_size(struct cgroup_taskset *tset) -{ - return tset->tc_array ? 
tset->tc_array_len : 1; -} -EXPORT_SYMBOL_GPL(cgroup_taskset_size); - - -/* * cgroup_task_migrate - move a task from one cgroup to another. + * @old_cgrp: the cgroup @tsk is being migrated from + * @tsk: the task being migrated + * @new_cset: the new css_set @tsk is being attached to * - * Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked. */ static void cgroup_task_migrate(struct cgroup *old_cgrp, struct task_struct *tsk, @@ -1925,6 +1702,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, { struct css_set *old_cset; + lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&css_set_rwsem); + /* * We are synchronized through threadgroup_lock() against PF_EXITING * setting such that we can't race against cgroup_exit() changing the @@ -1937,11 +1717,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, rcu_assign_pointer(tsk->cgroups, new_cset); task_unlock(tsk); - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &new_cset->tasks); - write_unlock(&css_set_lock); + list_move(&tsk->cg_list, &new_cset->tasks); /* * We just gained a reference on old_cset by taking it from the @@ -1949,26 +1725,26 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp, * we're safe to drop it here; it will be freed under RCU. */ set_bit(CGRP_RELEASABLE, &old_cgrp->flags); - put_css_set(old_cset); + put_css_set_locked(old_cset, false); } /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @tsk: the task or the leader of the threadgroup to be attached + * @leader: the task or the leader of the threadgroup to be attached * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *leader, bool threadgroup) { - int retval, i, group_size; + int ret, i, group_size; struct cgroupfs_root *root = cgrp->root; struct cgroup_subsys_state *css, *failed_css = NULL; /* threadgroup list cursor and array */ - struct task_struct *leader = tsk; + struct task_struct *task; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -1981,7 +1757,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * threads exit, this will just be an over-estimate. */ if (threadgroup) - group_size = get_nr_threads(tsk); + group_size = get_nr_threads(leader); else group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ @@ -1989,8 +1765,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); - if (retval) + ret = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); + if (ret) goto out_free_group_list; i = 0; @@ -1999,18 +1775,20 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * already PF_EXITING could be freed from underneath us unless we * take an rcu_read_lock. 
*/ + down_read(&css_set_rwsem); rcu_read_lock(); + task = leader; do { struct task_and_cgroup ent; - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) + /* @task either already exited or can't exit until the end */ + if (task->flags & PF_EXITING) goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - ent.task = tsk; - ent.cgrp = task_cgroup_from_root(tsk, root); + ent.task = task; + ent.cgrp = task_cgroup_from_root(task, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) goto next; @@ -2018,21 +1796,22 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. */ - retval = flex_array_put(group, i, &ent, GFP_ATOMIC); - BUG_ON(retval != 0); + ret = flex_array_put(group, i, &ent, GFP_ATOMIC); + BUG_ON(ret != 0); i++; next: if (!threadgroup) break; - } while_each_thread(leader, tsk); + } while_each_thread(leader, task); rcu_read_unlock(); + up_read(&css_set_rwsem); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; /* methods shouldn't be called if no task is actually migrating */ - retval = 0; + ret = 0; if (!group_size) goto out_free_group_list; @@ -2041,8 +1820,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for_each_css(css, i, cgrp) { if (css->ss->can_attach) { - retval = css->ss->can_attach(css, &tset); - if (retval) { + ret = css->ss->can_attach(css, &tset); + if (ret) { failed_css = css; goto out_cancel_attach; } @@ -2060,7 +1839,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, old_cset = task_css_set(tc->task); tc->cset = find_css_set(old_cset, cgrp); if (!tc->cset) { - retval = -ENOMEM; + ret = -ENOMEM; goto out_put_css_set_refs; } } @@ -2070,10 +1849,12 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * proceed to move all tasks to the new cgroup. There are no * failure cases after here, so this is the commit point. */ + down_write(&css_set_rwsem); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } + up_write(&css_set_rwsem); /* nothing is sensitive to fork() after this point. */ /* @@ -2086,18 +1867,18 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* * step 5: success! 
and cleanup */ - retval = 0; + ret = 0; out_put_css_set_refs: - if (retval) { + if (ret) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); if (!tc->cset) break; - put_css_set(tc->cset); + put_css_set(tc->cset, false); } } out_cancel_attach: - if (retval) { + if (ret) { for_each_css(css, i, cgrp) { if (css == failed_css) break; @@ -2107,7 +1888,7 @@ out_cancel_attach: } out_free_group_list: flex_array_free(group); - return retval; + return ret; } /* @@ -2203,7 +1984,11 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cgrp = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp; + + down_read(&css_set_rwsem); + from_cgrp = task_cgroup_from_root(from, root); + up_read(&css_set_rwsem); retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) @@ -2230,14 +2015,15 @@ static int cgroup_procs_write(struct cgroup_subsys_state *css, static int cgroup_release_agent_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); - if (strlen(buffer) >= PATH_MAX) - return -EINVAL; + struct cgroupfs_root *root = css->cgroup->root; + + BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX); if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; - mutex_lock(&cgroup_root_mutex); - strcpy(css->cgroup->root->release_agent_path, buffer); - mutex_unlock(&cgroup_root_mutex); + spin_lock(&release_agent_path_lock); + strlcpy(root->release_agent_path, buffer, + sizeof(root->release_agent_path)); + spin_unlock(&release_agent_path_lock); mutex_unlock(&cgroup_mutex); return 0; } @@ -2262,32 +2048,23 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -/* A buffer size big enough for numbers or short strings */ -#define CGROUP_LOCAL_BUFFER_SIZE 64 - -static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, - size_t nbytes, loff_t *ppos) +static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) { - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - size_t max_bytes = cft->max_write_len ?: CGROUP_LOCAL_BUFFER_SIZE - 1; - char *buf; + struct cgroup *cgrp = of->kn->parent->priv; + struct cftype *cft = of->kn->priv; + struct cgroup_subsys_state *css; int ret; - if (nbytes >= max_bytes) - return -E2BIG; - - buf = kmalloc(nbytes + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, userbuf, nbytes)) { - ret = -EFAULT; - goto out_free; - } - - buf[nbytes] = '\0'; + /* + * kernfs guarantees that a file isn't deleted with operations in + * flight, which means that the matching css is and stays alive and + * doesn't need to be pinned. The RCU locking is not necessary + * either. It's just for the convenience of using cgroup_css(). + */ + rcu_read_lock(); + css = cgroup_css(cgrp, cft->ss); + rcu_read_unlock(); if (cft->write_string) { ret = cft->write_string(css, cft, strstrip(buf)); @@ -2306,53 +2083,23 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *userbuf, } else { ret = -EINVAL; } -out_free: - kfree(buf); + return ret ?: nbytes; } -/* - * seqfile ops/methods for returning structured data. Currently just - * supports string->u64 maps, but can be extended in future. 
- */ - static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_start) { - return cft->seq_start(seq, ppos); - } else { - /* - * The same behavior and code as single_open(). Returns - * !NULL if pos is at the beginning; otherwise, NULL. - */ - return NULL + !*ppos; - } + return seq_cft(seq)->seq_start(seq, ppos); } static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_next) { - return cft->seq_next(seq, v, ppos); - } else { - /* - * The same behavior and code as single_open(), always - * terminate after the initial read. - */ - ++*ppos; - return NULL; - } + return seq_cft(seq)->seq_next(seq, v, ppos); } static void cgroup_seqfile_stop(struct seq_file *seq, void *v) { - struct cftype *cft = seq_cft(seq); - - if (cft->seq_stop) - cft->seq_stop(seq, v); + seq_cft(seq)->seq_stop(seq, v); } static int cgroup_seqfile_show(struct seq_file *m, void *arg) @@ -2372,96 +2119,35 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) return 0; } -static struct seq_operations cgroup_seq_operations = { - .start = cgroup_seqfile_start, - .next = cgroup_seqfile_next, - .stop = cgroup_seqfile_stop, - .show = cgroup_seqfile_show, +static struct kernfs_ops cgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_show = cgroup_seqfile_show, }; -static int cgroup_file_open(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - struct cgroup_subsys_state *css; - struct cgroup_open_file *of; - int err; - - err = generic_file_open(inode, file); - if (err) - return err; - - /* - * If the file belongs to a subsystem, pin the css. Will be - * unpinned either on open failure or release. This ensures that - * @css stays alive for all file operations. - */ - rcu_read_lock(); - css = cgroup_css(cgrp, cft->ss); - if (cft->ss && !css_tryget(css)) - css = NULL; - rcu_read_unlock(); - - if (!css) - return -ENODEV; - - /* - * @cfe->css is used by read/write/close to determine the - * associated css. @file->private_data would be a better place but - * that's already used by seqfile. Multiple accessors may use it - * simultaneously which is okay as the association never changes. - */ - WARN_ON_ONCE(cfe->css && cfe->css != css); - cfe->css = css; - - of = __seq_open_private(file, &cgroup_seq_operations, - sizeof(struct cgroup_open_file)); - if (of) { - of->cfe = cfe; - return 0; - } - - if (css->ss) - css_put(css); - return -ENOMEM; -} - -static int cgroup_file_release(struct inode *inode, struct file *file) -{ - struct cfent *cfe = __d_cfe(file->f_dentry); - struct cgroup_subsys_state *css = cfe->css; - - if (css->ss) - css_put(css); - return seq_release_private(inode, file); -} +static struct kernfs_ops cgroup_kf_ops = { + .atomic_write_len = PAGE_SIZE, + .write = cgroup_file_write, + .seq_start = cgroup_seqfile_start, + .seq_next = cgroup_seqfile_next, + .seq_stop = cgroup_seqfile_stop, + .seq_show = cgroup_seqfile_show, +}; /* * cgroup_rename - Only allow simple rename of directories in place. 
*/ -static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) +static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, + const char *new_name_str) { + struct cgroup *cgrp = kn->priv; int ret; - struct cgroup_name *name, *old_name; - struct cgroup *cgrp; - - /* - * It's convinient to use parent dir's i_mutex to protected - * cgrp->name. - */ - lockdep_assert_held(&old_dir->i_mutex); - if (!S_ISDIR(old_dentry->d_inode->i_mode)) + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; - if (new_dentry->d_inode) - return -EEXIST; - if (old_dir != new_dir) + if (kn->parent != new_parent) return -EIO; - cgrp = __d_cgrp(old_dentry); - /* * This isn't a proper migration and its usefulness is very * limited. Disallow if sane_behavior. @@ -2469,218 +2155,29 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, if (cgroup_sane_behavior(cgrp)) return -EPERM; - name = cgroup_alloc_name(new_dentry); - if (!name) - return -ENOMEM; - - ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); - if (ret) { - kfree(name); - return ret; - } - - old_name = rcu_dereference_protected(cgrp->name, true); - rcu_assign_pointer(cgrp->name, name); - - kfree_rcu(old_name, rcu_head); - return 0; -} - -static struct simple_xattrs *__d_xattrs(struct dentry *dentry) -{ - if (S_ISDIR(dentry->d_inode->i_mode)) - return &__d_cgrp(dentry)->xattrs; - else - return &__d_cfe(dentry)->xattrs; -} - -static inline int xattr_enabled(struct dentry *dentry) -{ - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return root->flags & CGRP_ROOT_XATTR; -} - -static bool is_valid_xattr(const char *name) -{ - if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) - return true; - return false; -} - -static int cgroup_setxattr(struct dentry *dentry, const char *name, - const void *val, size_t size, int flags) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); -} - -static int cgroup_removexattr(struct dentry *dentry, const char *name) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_remove(__d_xattrs(dentry), name); -} - -static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, - void *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - if (!is_valid_xattr(name)) - return -EINVAL; - return simple_xattr_get(__d_xattrs(dentry), name, buf, size); -} - -static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) -{ - if (!xattr_enabled(dentry)) - return -EOPNOTSUPP; - return simple_xattr_list(__d_xattrs(dentry), buf, size); -} - -static const struct file_operations cgroup_file_operations = { - .read = seq_read, - .write = cgroup_file_write, - .llseek = generic_file_llseek, - .open = cgroup_file_open, - .release = cgroup_file_release, -}; - -static const struct inode_operations cgroup_file_inode_operations = { - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static const struct inode_operations cgroup_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = cgroup_mkdir, - .rmdir = cgroup_rmdir, - .rename = cgroup_rename, - .setxattr = cgroup_setxattr, - .getxattr = cgroup_getxattr, - .listxattr = 
cgroup_listxattr, - .removexattr = cgroup_removexattr, -}; - -static int cgroup_create_file(struct dentry *dentry, umode_t mode, - struct super_block *sb) -{ - struct inode *inode; - - if (!dentry) - return -ENOENT; - if (dentry->d_inode) - return -EEXIST; - - inode = cgroup_new_inode(mode, sb); - if (!inode) - return -ENOMEM; - - if (S_ISDIR(mode)) { - inode->i_op = &cgroup_dir_inode_operations; - inode->i_fop = &simple_dir_operations; - - /* start off with i_nlink == 2 (for "." entry) */ - inc_nlink(inode); - inc_nlink(dentry->d_parent->d_inode); - - /* - * Control reaches here with cgroup_mutex held. - * @inode->i_mutex should nest outside cgroup_mutex but we - * want to populate it immediately without releasing - * cgroup_mutex. As @inode isn't visible to anyone else - * yet, trylock will always succeed without affecting - * lockdep checks. - */ - WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); - } else if (S_ISREG(mode)) { - inode->i_size = 0; - inode->i_fop = &cgroup_file_operations; - inode->i_op = &cgroup_file_inode_operations; - } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ - return 0; -} - -/** - * cgroup_file_mode - deduce file mode of a control file - * @cft: the control file in question - * - * returns cft->mode if ->mode is not 0 - * returns S_IRUGO|S_IWUSR if it has both a read and a write handler - * returns S_IRUGO if it has only a read handler - * returns S_IWUSR if it has only a write hander - */ -static umode_t cgroup_file_mode(const struct cftype *cft) -{ - umode_t mode = 0; - - if (cft->mode) - return cft->mode; - - if (cft->read_u64 || cft->read_s64 || cft->seq_show) - mode |= S_IRUGO; + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); - if (cft->write_u64 || cft->write_s64 || cft->write_string || - cft->trigger) - mode |= S_IWUSR; + ret = kernfs_rename(kn, new_parent, new_name_str); - return mode; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); + return ret; } static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { - struct dentry *dir = cgrp->dentry; - struct cgroup *parent = __d_cgrp(dir); - struct dentry *dentry; - struct cfent *cfe; - int error; - umode_t mode; - char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - - if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && - !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, cft->ss->name); - strcat(name, "."); - } - strcat(name, cft->name); + char name[CGROUP_FILE_NAME_MAX]; + struct kernfs_node *kn; + struct lock_class_key *key = NULL; - BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); - - cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); - if (!cfe) - return -ENOMEM; - - dentry = lookup_one_len(name, dir, strlen(name)); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out; - } - - cfe->type = (void *)cft; - cfe->dentry = dentry; - dentry->d_fsdata = cfe; - simple_xattrs_init(&cfe->xattrs); - - mode = cgroup_file_mode(cft); - error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); - if (!error) { - list_add_tail(&cfe->node, &parent->files); - cfe = NULL; - } - dput(dentry); -out: - kfree(cfe); - return error; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + key = &cft->lockdep_key; +#endif + kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name), + cgroup_file_mode(cft), 0, cft->kf_ops, cft, + NULL, false, key); + return PTR_ERR_OR_ZERO(kn); } /** @@ -2700,8 +2197,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], struct cftype *cft; int ret; - 
lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); - lockdep_assert_held(&cgroup_mutex); + lockdep_assert_held(&cgroup_tree_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2726,44 +2222,19 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], return 0; } -static void cgroup_cfts_prepare(void) - __acquires(&cgroup_mutex) -{ - /* - * Thanks to the entanglement with vfs inode locking, we can't walk - * the existing cgroups under cgroup_mutex and create files. - * Instead, we use css_for_each_descendant_pre() and drop RCU read - * lock before calling cgroup_addrm_files(). - */ - mutex_lock(&cgroup_mutex); -} - -static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) - __releases(&cgroup_mutex) +static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *root = &ss->root->top_cgroup; - struct super_block *sb = ss->root->sb; - struct dentry *prev = NULL; - struct inode *inode; struct cgroup_subsys_state *css; - u64 update_before; int ret = 0; - /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ - if (!cfts || ss->root == &cgroup_dummy_root || - !atomic_inc_not_zero(&sb->s_active)) { - mutex_unlock(&cgroup_mutex); - return 0; - } + lockdep_assert_held(&cgroup_tree_mutex); - /* - * All cgroups which are created after we drop cgroup_mutex will - * have the updated set of files, so we only need to update the - * cgroups created before the current @cgroup_serial_nr_next. - */ - update_before = cgroup_serial_nr_next; + /* don't bother if @ss isn't attached */ + if (ss->root == &cgroup_dummy_root) + return 0; /* add/rm files for all cgroups created before */ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { @@ -2772,62 +2243,75 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) if (cgroup_is_dead(cgrp)) continue; - inode = cgrp->dentry->d_inode; - dget(cgrp->dentry); - dput(prev); - prev = cgrp->dentry; - - mutex_unlock(&cgroup_mutex); - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, cfts, is_add); - mutex_unlock(&inode->i_mutex); + ret = cgroup_addrm_files(cgrp, cfts, is_add); if (ret) break; } - mutex_unlock(&cgroup_mutex); - dput(prev); - deactivate_super(sb); + + if (is_add && !ret) + kernfs_activate(root->kn); return ret; } -/** - * cgroup_add_cftypes - add an array of cftypes to a subsystem - * @ss: target cgroup subsystem - * @cfts: zero-length name terminated array of cftypes - * - * Register @cfts to @ss. Files described by @cfts are created for all - * existing cgroups to which @ss is attached and all future cgroups will - * have them too. This function can be called anytime whether @ss is - * attached or not. - * - * Returns 0 on successful registration, -errno on failure. Note that this - * function currently returns 0 as long as @cfts registration is successful - * even if some file creation attempts on existing cgroups fail. 
- */ -int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +static void cgroup_exit_cftypes(struct cftype *cfts) { - struct cftype_set *set; struct cftype *cft; - int ret; - set = kzalloc(sizeof(*set), GFP_KERNEL); - if (!set) - return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* free copy for custom atomic_write_len, see init_cftypes() */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) + kfree(cft->kf_ops); + cft->kf_ops = NULL; + cft->ss = NULL; + } +} + +static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + struct cftype *cft; - for (cft = cfts; cft->name[0] != '\0'; cft++) + for (cft = cfts; cft->name[0] != '\0'; cft++) { + struct kernfs_ops *kf_ops; + + WARN_ON(cft->ss || cft->kf_ops); + + if (cft->seq_start) + kf_ops = &cgroup_kf_ops; + else + kf_ops = &cgroup_kf_single_ops; + + /* + * Ugh... if @cft wants a custom max_write_len, we need to + * make a copy of kf_ops to set its atomic_write_len. + */ + if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { + kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); + if (!kf_ops) { + cgroup_exit_cftypes(cfts); + return -ENOMEM; + } + kf_ops->atomic_write_len = cft->max_write_len; + } + + cft->kf_ops = kf_ops; cft->ss = ss; + } - cgroup_cfts_prepare(); - set->cfts = cfts; - list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(cfts, true); - if (ret) - cgroup_rm_cftypes(cfts); - return ret; + return 0; +} + +static int cgroup_rm_cftypes_locked(struct cftype *cfts) +{ + lockdep_assert_held(&cgroup_tree_mutex); + + if (!cfts || !cfts[0].ss) + return -ENOENT; + + list_del(&cfts->node); + cgroup_apply_cftypes(cfts, false); + cgroup_exit_cftypes(cfts); + return 0; } -EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem @@ -2842,24 +2326,48 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); */ int cgroup_rm_cftypes(struct cftype *cfts) { - struct cftype_set *set; + int ret; - if (!cfts || !cfts[0].ss) - return -ENOENT; + mutex_lock(&cgroup_tree_mutex); + ret = cgroup_rm_cftypes_locked(cfts); + mutex_unlock(&cgroup_tree_mutex); + return ret; +} - cgroup_cfts_prepare(); +/** + * cgroup_add_cftypes - add an array of cftypes to a subsystem + * @ss: target cgroup subsystem + * @cfts: zero-length name terminated array of cftypes + * + * Register @cfts to @ss. Files described by @cfts are created for all + * existing cgroups to which @ss is attached and all future cgroups will + * have them too. This function can be called anytime whether @ss is + * attached or not. + * + * Returns 0 on successful registration, -errno on failure. Note that this + * function currently returns 0 as long as @cfts registration is successful + * even if some file creation attempts on existing cgroups fail. 
+ */ +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +{ + int ret; - list_for_each_entry(set, &cfts[0].ss->cftsets, node) { - if (set->cfts == cfts) { - list_del(&set->node); - kfree(set); - cgroup_cfts_commit(cfts, false); - return 0; - } - } + if (!cfts || cfts[0].name[0] == '\0') + return 0; + + ret = cgroup_init_cftypes(ss, cfts); + if (ret) + return ret; + + mutex_lock(&cgroup_tree_mutex); - cgroup_cfts_commit(NULL, false); - return -ENOENT; + list_add_tail(&cfts->node, &ss->cfts); + ret = cgroup_apply_cftypes(cfts, true); + if (ret) + cgroup_rm_cftypes_locked(cfts); + + mutex_unlock(&cgroup_tree_mutex); + return ret; } /** @@ -2868,57 +2376,18 @@ int cgroup_rm_cftypes(struct cftype *cfts) * * Return the number of tasks in the cgroup. */ -int cgroup_task_count(const struct cgroup *cgrp) +static int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &cgrp->cset_links, cset_link) count += atomic_read(&link->cset->refcount); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return count; } -/* - * To reduce the fork() overhead for systems that are not actually using - * their cgroups capability, we don't maintain the lists running through - * each css_set to its tasks until we see the list actually used - in other - * words after the first call to css_task_iter_start(). - */ -static void cgroup_enable_task_cg_lists(void) -{ - struct task_struct *p, *g; - write_lock(&css_set_lock); - use_task_css_set_links = 1; - /* - * We need tasklist_lock because RCU is not safe against - * while_each_thread(). Besides, a forking task that has passed - * cgroup_post_fork() without seeing use_task_css_set_links = 1 - * is not guaranteed to have its child immediately visible in the - * tasklist if we walk through it with RCU. - */ - read_lock(&tasklist_lock); - do_each_thread(g, p) { - task_lock(p); - /* - * We should check if the process is exiting, otherwise - * it will race with cgroup_exit() in that the list - * entry won't be deleted though the process has exited. - * Do it while holding siglock so that we don't end up - * racing against cgroup_exit(). - */ - spin_lock_irq(&p->sighand->siglock); - if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) - list_add(&p->cg_list, &task_css_set(p)->tasks); - spin_unlock_irq(&p->sighand->siglock); - - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - write_unlock(&css_set_lock); -} - /** * css_next_child - find the next child of a given css * @pos_css: the current position (%NULL to initiate traversal) @@ -2937,7 +2406,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* * @pos could already have been removed. 
Once a cgroup is removed, @@ -2973,7 +2442,6 @@ css_next_child(struct cgroup_subsys_state *pos_css, return cgroup_css(next, parent_css->ss); } -EXPORT_SYMBOL_GPL(css_next_child); /** * css_next_descendant_pre - find the next descendant for pre-order walk @@ -2995,7 +2463,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit @root */ if (!pos) @@ -3016,7 +2484,6 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } -EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** * css_rightmost_descendant - return the rightmost descendant of a css @@ -3036,7 +2503,7 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) { struct cgroup_subsys_state *last, *tmp; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); do { last = pos; @@ -3048,7 +2515,6 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos) return last; } -EXPORT_SYMBOL_GPL(css_rightmost_descendant); static struct cgroup_subsys_state * css_leftmost_descendant(struct cgroup_subsys_state *pos) @@ -3084,7 +2550,7 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, { struct cgroup_subsys_state *next; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); /* if first iteration, visit leftmost descendant which may be @root */ if (!pos) @@ -3102,7 +2568,6 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* no sibling left, visit parent */ return css_parent(pos); } -EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * css_advance_task_iter - advance a task itererator to the next css_set @@ -3146,17 +2611,12 @@ static void css_advance_task_iter(struct css_task_iter *it) */ void css_task_iter_start(struct cgroup_subsys_state *css, struct css_task_iter *it) - __acquires(css_set_lock) + __acquires(css_set_rwsem) { - /* - * The first time anyone tries to iterate across a css, we need to - * enable the list linking each css_set to its tasks, and fix up - * all existing tasks. - */ - if (!use_task_css_set_links) - cgroup_enable_task_cg_lists(); + /* no one should try to iterate before mounting cgroups */ + WARN_ON_ONCE(!use_task_css_set_links); - read_lock(&css_set_lock); + down_read(&css_set_rwsem); it->origin_css = css; it->cset_link = &css->cgroup->cset_links; @@ -3204,180 +2664,9 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) * Finish task iteration started by css_task_iter_start(). */ void css_task_iter_end(struct css_task_iter *it) - __releases(css_set_lock) -{ - read_unlock(&css_set_lock); -} - -static inline int started_after_time(struct task_struct *t1, - struct timespec *time, - struct task_struct *t2) -{ - int start_diff = timespec_compare(&t1->start_time, time); - if (start_diff > 0) { - return 1; - } else if (start_diff < 0) { - return 0; - } else { - /* - * Arbitrarily, if two processes started at the same - * time, we'll say that the lower pointer value - * started first. Note that t2 may have exited by now - * so this may not be a valid pointer any longer, but - * that's fine - it still serves to distinguish - * between two tasks started (effectively) simultaneously. - */ - return t1 > t2; - } -} - -/* - * This function is a callback from heap_insert() and is used to order - * the heap. - * In this case we order the heap in descending task start time. 
- */ -static inline int started_after(void *p1, void *p2) -{ - struct task_struct *t1 = p1; - struct task_struct *t2 = p2; - return started_after_time(t1, &t2->start_time, t2); -} - -/** - * css_scan_tasks - iterate though all the tasks in a css - * @css: the css to iterate tasks of - * @test: optional test callback - * @process: process callback - * @data: data passed to @test and @process - * @heap: optional pre-allocated heap used for task iteration - * - * Iterate through all the tasks in @css, calling @test for each, and if it - * returns %true, call @process for it also. - * - * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates css_task_iter_{start,next,end}() but does not - * lock css_set_lock for the call to @process. - * - * It is guaranteed that @process will act on every task that is a member - * of @css for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different css during the - * call, or are forked or move into the css during the call. - * - * Note that @test may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should be - * cheap. - * - * If @heap is non-NULL, a heap has been pre-allocated and will be used for - * heap operations (and its "gt" member will be overwritten), else a - * temporary heap will be used (allocation of which may cause this function - * to fail). - */ -int css_scan_tasks(struct cgroup_subsys_state *css, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) -{ - int retval, i; - struct css_task_iter it; - struct task_struct *p, *dropped; - /* Never dereference latest_task, since it's not refcounted */ - struct task_struct *latest_task = NULL; - struct ptr_heap tmp_heap; - struct timespec latest_time = { 0, 0 }; - - if (heap) { - /* The caller supplied our heap and pre-allocated its memory */ - heap->gt = &started_after; - } else { - /* We need to allocate our own heap memory */ - heap = &tmp_heap; - retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); - if (retval) - /* cannot allocate the heap */ - return retval; - } - - again: - /* - * Scan tasks in the css, using the @test callback to determine - * which are of interest, and invoking @process callback on the - * ones which need an update. Since we don't want to hold any - * locks during the task updates, gather tasks to be processed in a - * heap structure. The heap is sorted by descending task start - * time. If the statically-sized heap fills up, we overflow tasks - * that started later, and in future iterations only consider tasks - * that started after the latest task in the previous pass. This - * guarantees forward progress and that we don't miss any tasks. 
- */ - heap->size = 0; - css_task_iter_start(css, &it); - while ((p = css_task_iter_next(&it))) { - /* - * Only affect tasks that qualify per the caller's callback, - * if he provided one - */ - if (test && !test(p, data)) - continue; - /* - * Only process tasks that started after the last task - * we processed - */ - if (!started_after_time(p, &latest_time, latest_task)) - continue; - dropped = heap_insert(heap, p); - if (dropped == NULL) { - /* - * The new task was inserted; the heap wasn't - * previously full - */ - get_task_struct(p); - } else if (dropped != p) { - /* - * The new task was inserted, and pushed out a - * different task - */ - get_task_struct(p); - put_task_struct(dropped); - } - /* - * Else the new task was newer than anything already in - * the heap and wasn't inserted - */ - } - css_task_iter_end(&it); - - if (heap->size) { - for (i = 0; i < heap->size; i++) { - struct task_struct *q = heap->ptrs[i]; - if (i == 0) { - latest_time = q->start_time; - latest_task = q; - } - /* Process the task per the caller's callback */ - process(q, data); - put_task_struct(q); - } - /* - * If we had to process any tasks at all, scan again - * in case some of them were in the middle of forking - * children that didn't get processed. - * Not the most efficient way to do it, but it avoids - * having to take callback_mutex in the fork path - */ - goto again; - } - if (heap == &tmp_heap) - heap_free(&tmp_heap); - return 0; -} - -static void cgroup_transfer_one_task(struct task_struct *task, void *data) + __releases(css_set_rwsem) { - struct cgroup *new_cgroup = data; - - mutex_lock(&cgroup_mutex); - cgroup_attach_task(new_cgroup, task, false); - mutex_unlock(&cgroup_mutex); + up_read(&css_set_rwsem); } /** @@ -3387,8 +2676,26 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, - to, NULL); + struct css_task_iter it; + struct task_struct *task; + int ret = 0; + + do { + css_task_iter_start(&from->dummy_css, &it); + task = css_task_iter_next(&it); + if (task) + get_task_struct(task); + css_task_iter_end(&it); + + if (task) { + mutex_lock(&cgroup_mutex); + ret = cgroup_attach_task(to, task, false); + mutex_unlock(&cgroup_mutex); + put_task_struct(task); + } + } while (task && !ret); + + return ret; } /* @@ -3687,21 +2994,31 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, */ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { - int ret = -EINVAL; + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); struct cgroup *cgrp; struct css_task_iter it; struct task_struct *tsk; + /* it should be kernfs_node belonging to cgroupfs and is a directory */ + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) + return -EINVAL; + + mutex_lock(&cgroup_mutex); + /* - * Validate dentry by checking the superblock operations, - * and make sure it's a directory. + * We aren't being called from kernfs and there's no guarantee on + * @kn->priv's validity. For this and css_tryget_from_dir(), + * @kn->priv is RCU safe. Let's do the RCU dancing. 
*/ - if (dentry->d_sb->s_op != &cgroup_ops || - !S_ISDIR(dentry->d_inode->i_mode)) - goto err; - - ret = 0; - cgrp = dentry->d_fsdata; + rcu_read_lock(); + cgrp = rcu_dereference(kn->priv); + if (!cgrp || cgroup_is_dead(cgrp)) { + rcu_read_unlock(); + mutex_unlock(&cgroup_mutex); + return -ENOENT; + } + rcu_read_unlock(); css_task_iter_start(&cgrp->dummy_css, &it); while ((tsk = css_task_iter_next(&it))) { @@ -3726,8 +3043,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) } css_task_iter_end(&it); -err: - return ret; + mutex_unlock(&cgroup_mutex); + return 0; } @@ -3745,7 +3062,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -3800,7 +3117,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; if (l) @@ -3811,7 +3128,7 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_open_file *of = s->private; + struct kernfs_open_file *of = s->private; struct cgroup_pidlist *l = of->priv; pid_t *p = v; pid_t *end = l->list + l->length; @@ -3861,23 +3178,6 @@ static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, return 0; } -/* - * When dput() is called asynchronously, if umount has been done and - * then deactivate_super() in cgroup_free_fn() kills the superblock, - * there's a small window that vfs will see the root dentry with non-zero - * refcnt and trigger BUG(). - * - * That's why we hold a reference before dput() and drop it right after. - */ -static void cgroup_dput(struct cgroup *cgrp) -{ - struct super_block *sb = cgrp->root->sb; - - atomic_inc(&sb->s_active); - dput(cgrp->dentry); - deactivate_super(sb); -} - static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -3944,7 +3244,7 @@ static struct cftype cgroup_base_files[] = { .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, .seq_show = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, - .max_write_len = PATH_MAX, + .max_write_len = PATH_MAX - 1, }, { } /* terminate */ }; @@ -3963,13 +3263,13 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* process cftsets of each subsystem */ for_each_subsys(ss, i) { - struct cftype_set *set; + struct cftype *cfts; if (!test_bit(i, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, set->cfts, true); + list_for_each_entry(cfts, &ss->cfts, node) { + ret = cgroup_addrm_files(cgrp, cfts, true); if (ret < 0) goto err; } @@ -4012,7 +3312,7 @@ static void css_free_work_fn(struct work_struct *work) css_put(css->parent); css->ss->css_free(css); - cgroup_dput(cgrp); + cgroup_put(cgrp); } static void css_free_rcu_fn(struct rcu_head *rcu_head) @@ -4020,10 +3320,6 @@ static void css_free_rcu_fn(struct rcu_head *rcu_head) struct cgroup_subsys_state *css = container_of(rcu_head, struct cgroup_subsys_state, rcu_head); - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). 
dput() requires process context which we don't have. - */ INIT_WORK(&css->destroy_work, css_free_work_fn); queue_work(cgroup_destroy_wq, &css->destroy_work); } @@ -4033,7 +3329,7 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - rcu_assign_pointer(css->cgroup->subsys[css->ss->subsys_id], NULL); + rcu_assign_pointer(css->cgroup->subsys[css->ss->id], NULL); call_rcu(&css->rcu_head, css_free_rcu_fn); } @@ -4058,6 +3354,7 @@ static int online_css(struct cgroup_subsys_state *css) struct cgroup_subsys *ss = css->ss; int ret = 0; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (ss->css_online) @@ -4065,7 +3362,7 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; css->cgroup->nr_css++; - rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + rcu_assign_pointer(css->cgroup->subsys[ss->id], css); } return ret; } @@ -4075,6 +3372,7 @@ static void offline_css(struct cgroup_subsys_state *css) { struct cgroup_subsys *ss = css->ss; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); if (!(css->flags & CSS_ONLINE)) @@ -4085,7 +3383,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; - RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); + RCU_INIT_POINTER(css->cgroup->subsys[ss->id], css); } /** @@ -4103,7 +3401,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) struct cgroup_subsys_state *css; int err; - lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); css = ss->css_alloc(cgroup_css(parent, ss)); @@ -4116,7 +3413,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) init_css(css, ss, cgrp); - err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id); + err = cgroup_populate_dir(cgrp, 1 << ss->id); if (err) goto err_free; @@ -4124,7 +3421,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) if (err) goto err_free; - dget(cgrp->dentry); + cgroup_get(cgrp); css_get(css->parent); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && @@ -4144,35 +3441,27 @@ err_free: return err; } -/* +/** * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup - * @dentry: dentry of the new cgroup - * @mode: mode to set on new inode - * - * Must be called with the mutex on the parent inode held + * @name: name of the new cgroup + * @mode: mode to set on new cgroup */ -static long cgroup_create(struct cgroup *parent, struct dentry *dentry, - umode_t mode) +static long cgroup_create(struct cgroup *parent, const char *name, + umode_t mode) { struct cgroup *cgrp; - struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int ssid, err; struct cgroup_subsys *ss; - struct super_block *sb = root->sb; + struct kernfs_node *kn; /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; - name = cgroup_alloc_name(dentry); - if (!name) { - err = -ENOMEM; - goto err_free_cgrp; - } - rcu_assign_pointer(cgrp->name, name); + mutex_lock(&cgroup_tree_mutex); /* * Only live parents can have children. 
Note that the liveliness @@ -4183,7 +3472,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_name; + goto err_unlock_tree; } /* @@ -4196,18 +3485,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_unlock; } - /* Grab a reference on the superblock so the hierarchy doesn't - * get deleted on unmount if there are child cgroups. This - * can be done outside cgroup_mutex, since the sb can't - * disappear while someone has an open control file on the - * fs */ - atomic_inc(&sb->s_active); - init_cgroup_housekeeping(cgrp); - dentry->d_fsdata = cgrp; - cgrp->dentry = dentry; - cgrp->parent = parent; cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; @@ -4218,24 +3497,26 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + err = PTR_ERR(kn); + goto err_free_id; + } + cgrp->kn = kn; + /* - * Create directory. cgroup_create_file() returns with the new - * directory locked on success so that it can be populated without - * dropping cgroup_mutex. + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. */ - err = cgroup_create_file(dentry, S_IFDIR | mode, sb); - if (err < 0) - goto err_free_id; - lockdep_assert_held(&dentry->d_inode->i_mutex); + kernfs_get(kn); cgrp->serial_nr = cgroup_serial_nr_next++; /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - - /* hold a ref to the parent's dentry */ - dget(parent->dentry); + atomic_inc(&root->nr_cgrps); + cgroup_get(parent); /* * @cgrp is now fully operational. 
If something fails after this @@ -4256,36 +3537,35 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + kernfs_activate(kn); + mutex_unlock(&cgroup_mutex); - mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return 0; err_free_id: idr_remove(&root->cgroup_idr, cgrp->id); - /* Release the reference count that we took on the superblock */ - deactivate_super(sb); err_unlock: mutex_unlock(&cgroup_mutex); -err_free_name: - kfree(rcu_dereference_raw(cgrp->name)); -err_free_cgrp: +err_unlock_tree: + mutex_unlock(&cgroup_tree_mutex); kfree(cgrp); return err; err_destroy: cgroup_destroy_locked(cgrp); mutex_unlock(&cgroup_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&cgroup_tree_mutex); return err; } -static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) { - struct cgroup *c_parent = dentry->d_parent->d_fsdata; + struct cgroup *parent = parent_kn->priv; - /* the vfs holds inode->i_mutex already */ - return cgroup_create(c_parent, dentry, mode | S_IFDIR); + return cgroup_create(parent, name, mode); } /* @@ -4298,6 +3578,7 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); /* @@ -4315,6 +3596,7 @@ static void css_killed_work_fn(struct work_struct *work) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); /* * Put the css refs from kill_css(). Each css holds an extra @@ -4347,7 +3629,11 @@ static void css_killed_ref_fn(struct percpu_ref *ref) */ static void kill_css(struct cgroup_subsys_state *css) { - cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + /* + * This must happen before css is disassociated with its cgroup. + * See seq_css() for details. + */ + cgroup_clear_dir(css->cgroup, 1 << css->ss->id); /* * Killing would put the base ref, but we need to keep it alive @@ -4395,22 +3681,21 @@ static void kill_css(struct cgroup_subsys_state *css) static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct dentry *d = cgrp->dentry; - struct cgroup_subsys_state *css; struct cgroup *child; + struct cgroup_subsys_state *css; bool empty; int ssid; - lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* - * css_set_lock synchronizes access to ->cset_links and prevents - * @cgrp from being removed while __put_css_set() is in progress. + * css_set_rwsem synchronizes access to ->cset_links and prevents + * @cgrp from being removed while put_css_set() is in progress. */ - read_lock(&css_set_lock); + down_read(&css_set_rwsem); empty = list_empty(&cgrp->cset_links); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); if (!empty) return -EBUSY; @@ -4433,10 +3718,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Initiate massacre of all css's. cgroup_destroy_css_killed() * will be invoked to perform the rest of destruction once the - * percpu refs of all css's are confirmed to be killed. + * percpu refs of all css's are confirmed to be killed. This + * involves removing the subsystem's files, drop cgroup_mutex. */ + mutex_unlock(&cgroup_mutex); for_each_css(css, ssid, cgrp) kill_css(css); + mutex_lock(&cgroup_mutex); /* * Mark @cgrp dead. 
This prevents further task migration and child @@ -4462,14 +3750,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (!cgrp->nr_css) cgroup_destroy_css_killed(cgrp); + /* remove @cgrp directory along with the base files */ + mutex_unlock(&cgroup_mutex); + /* - * Clear the base files and remove @cgrp directory. The removal - * puts the base ref but we aren't quite done with @cgrp yet, so - * hold onto it. + * There are two control paths which try to determine cgroup from + * dentry without going through kernfs - cgroupstats_build() and + * css_tryget_from_dir(). Those are supported by RCU protecting + * clearing of cgrp->kn->priv backpointer, which should happen + * after all files under it have been removed. */ - cgroup_addrm_files(cgrp, cgroup_base_files, false); - dget(d); - cgroup_d_remove_dir(d); + kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */ + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + + mutex_lock(&cgroup_mutex); return 0; }; @@ -4486,59 +3780,69 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; - struct dentry *d = cgrp->dentry; + lockdep_assert_held(&cgroup_tree_mutex); lockdep_assert_held(&cgroup_mutex); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - dput(d); + cgroup_put(cgrp); set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_rmdir(struct kernfs_node *kn) { - int ret; - - mutex_lock(&cgroup_mutex); - ret = cgroup_destroy_locked(dentry->d_fsdata); - mutex_unlock(&cgroup_mutex); + struct cgroup *cgrp = kn->priv; + int ret = 0; - return ret; -} + /* + * This is self-destruction but @kn can't be removed while this + * callback is in progress. Let's break active protection. Once + * the protection is broken, @cgrp can be destroyed at any point. + * Pin it so that it stays accessible. + */ + cgroup_get(cgrp); + kernfs_break_active_protection(kn); -static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) -{ - INIT_LIST_HEAD(&ss->cftsets); + mutex_lock(&cgroup_tree_mutex); + mutex_lock(&cgroup_mutex); /* - * base_cftset is embedded in subsys itself, no need to worry about - * deregistration. + * @cgrp might already have been destroyed while we're trying to + * grab the mutexes. 
*/ - if (ss->base_cftypes) { - struct cftype *cft; + if (!cgroup_is_dead(cgrp)) + ret = cgroup_destroy_locked(cgrp); - for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) - cft->ss = ss; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); - ss->base_cftset.cfts = ss->base_cftypes; - list_add_tail(&ss->base_cftset.node, &ss->cftsets); - } + kernfs_unbreak_active_protection(kn); + cgroup_put(cgrp); + return ret; } +static struct kernfs_syscall_ops cgroup_kf_syscall_ops = { + .remount_fs = cgroup_remount, + .show_options = cgroup_show_options, + .mkdir = cgroup_mkdir, + .rmdir = cgroup_rmdir, + .rename = cgroup_rename, +}; + static void __init cgroup_init_subsys(struct cgroup_subsys *ss) { struct cgroup_subsys_state *css; printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_tree_mutex); mutex_lock(&cgroup_mutex); - /* init base cftset */ - cgroup_init_cftsets(ss); + INIT_LIST_HEAD(&ss->cfts); /* Create the top cgroup state for this subsystem */ ss->root = &cgroup_dummy_root; @@ -4551,7 +3855,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = css; + init_css_set.subsys[ss->id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4563,184 +3867,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); - - /* this function shouldn't be used with modular subsystems, since they - * need to register a subsys_id, among other things */ - BUG_ON(ss->module); -} - -/** - * cgroup_load_subsys: load and register a modular subsystem at runtime - * @ss: the subsystem to load - * - * This function should be called in a modular subsystem's initcall. If the - * subsystem is built as a module, it will be assigned a new subsys_id and set - * up for use. If the subsystem is built-in anyway, work is delegated to the - * simpler cgroup_init_subsys. - */ -int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) -{ - struct cgroup_subsys_state *css; - int i, ret; - struct hlist_node *tmp; - struct css_set *cset; - unsigned long key; - - /* check name and function validity */ - if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->css_alloc == NULL || ss->css_free == NULL) - return -EINVAL; - - /* - * we don't support callbacks in modular subsystems. this check is - * before the ss->module check for consistency; a subsystem that could - * be a module should still have no callbacks even if the user isn't - * compiling it as one. - */ - if (ss->fork || ss->exit) - return -EINVAL; - - /* - * an optionally modular subsystem is built-in: we want to do nothing, - * since cgroup_init_subsys will have already taken care of it. - */ - if (ss->module == NULL) { - /* a sanity check */ - BUG_ON(cgroup_subsys[ss->subsys_id] != ss); - return 0; - } - - /* init base cftset */ - cgroup_init_cftsets(ss); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - cgroup_subsys[ss->subsys_id] = ss; - - /* - * no ss->css_alloc seems to need anything important in the ss - * struct, so this can happen first (i.e. before the dummy root - * attachment). - */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); - if (IS_ERR(css)) { - /* failure case - need to deassign the cgroup_subsys[] slot. 
*/ - cgroup_subsys[ss->subsys_id] = NULL; - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return PTR_ERR(css); - } - - ss->root = &cgroup_dummy_root; - - /* our new subsystem will be attached to the dummy hierarchy. */ - init_css(css, ss, cgroup_dummy_top); - - /* - * Now we need to entangle the css into the existing css_sets. unlike - * in cgroup_init_subsys, there are now multiple css_sets, so each one - * will need a new pointer to it; done by iterating the css_set_table. - * furthermore, modifying the existing css_sets will corrupt the hash - * table state, so each changed css_set will need its hash recomputed. - * this is all done under the css_set_lock. - */ - write_lock(&css_set_lock); - hash_for_each_safe(css_set_table, i, tmp, cset, hlist) { - /* skip entries that we already rehashed */ - if (cset->subsys[ss->subsys_id]) - continue; - /* remove existing entry */ - hash_del(&cset->hlist); - /* set new value */ - cset->subsys[ss->subsys_id] = css; - /* recompute hash and restore entry */ - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - ret = online_css(css); - if (ret) { - ss->css_free(css); - goto err_unload; - } - - /* success! */ - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - return 0; - -err_unload: - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); - /* @ss can't be mounted here as try_module_get() would fail */ - cgroup_unload_subsys(ss); - return ret; -} -EXPORT_SYMBOL_GPL(cgroup_load_subsys); - -/** - * cgroup_unload_subsys: unload a modular subsystem - * @ss: the subsystem to unload - * - * This function should be called in a modular subsystem's exitcall. When this - * function is invoked, the refcount on the subsystem's module will be 0, so - * the subsystem will not be attached to any hierarchy. - */ -void cgroup_unload_subsys(struct cgroup_subsys *ss) -{ - struct cgrp_cset_link *link; - struct cgroup_subsys_state *css; - - BUG_ON(ss->module == NULL); - - /* - * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get() in rebind_subsystems() should ensure that it - * doesn't start being used while we're killing it off. - */ - BUG_ON(ss->root != &cgroup_dummy_root); - - mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); - - css = cgroup_css(cgroup_dummy_top, ss); - if (css) - offline_css(css); - - /* deassign the subsys_id */ - cgroup_subsys[ss->subsys_id] = NULL; - - /* - * disentangle the css from all css_sets attached to the dummy - * top. as in loading, we need to pay our respects to the hashtable - * gods. - */ - write_lock(&css_set_lock); - list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) { - struct css_set *cset = link->cset; - unsigned long key; - - hash_del(&cset->hlist); - cset->subsys[ss->subsys_id] = NULL; - key = css_set_hash(cset->subsys); - hash_add(css_set_table, &cset->hlist, key); - } - write_unlock(&css_set_lock); - - /* - * remove subsystem's css from the cgroup_dummy_top and free it - - * need to free before marking as null because ss->css_free needs - * the cgrp->subsys pointer to find their state. 
- */ - if (css) - ss->css_free(css); - RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); - - mutex_unlock(&cgroup_root_mutex); - mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgroup_tree_mutex); } -EXPORT_SYMBOL_GPL(cgroup_unload_subsys); /** * cgroup_init_early - cgroup initialization at system boot @@ -4767,17 +3895,16 @@ int __init cgroup_init_early(void) list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links); list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links); - /* at bootup time, we don't worry about modular subsystems */ - for_each_builtin_subsys(ss, i) { - BUG_ON(!ss->name); - BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->css_alloc); - BUG_ON(!ss->css_free); - if (ss->subsys_id != i) { - printk(KERN_ERR "cgroup: Subsys %s id == %d\n", - ss->name, ss->subsys_id); - BUG(); - } + for_each_subsys(ss, i) { + WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, + ss->id, ss->name); + WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, + "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]); + + ss->id = i; + ss->name = cgroup_subsys_name[i]; if (ss->early_init) cgroup_init_subsys(ss); @@ -4797,18 +3924,22 @@ int __init cgroup_init(void) unsigned long key; int i, err; - err = bdi_init(&cgroup_backing_dev_info); - if (err) - return err; + BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!ss->early_init) cgroup_init_subsys(ss); + + /* + * cftype registration needs kmalloc and can't be done + * during early_init. Register base cftypes separately. + */ + if (ss->base_cftypes) + WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); } /* allocate id for the dummy hierarchy */ mutex_lock(&cgroup_mutex); - mutex_lock(&cgroup_root_mutex); /* Add init_css_set to the hash table */ key = css_set_hash(init_css_set.subsys); @@ -4820,28 +3951,20 @@ int __init cgroup_init(void) 0, 1, GFP_KERNEL); BUG_ON(err < 0); - mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); - if (!cgroup_kobj) { - err = -ENOMEM; - goto out; - } + if (!cgroup_kobj) + return -ENOMEM; err = register_filesystem(&cgroup_fs_type); if (err < 0) { kobject_put(cgroup_kobj); - goto out; + return err; } proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); - -out: - if (err) - bdi_destroy(&cgroup_backing_dev_info); - - return err; + return 0; } static int __init cgroup_wq_init(void) @@ -4886,12 +4009,12 @@ int proc_cgroup_show(struct seq_file *m, void *v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *path; int retval; struct cgroupfs_root *root; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -4904,6 +4027,7 @@ int proc_cgroup_show(struct seq_file *m, void *v) retval = 0; mutex_lock(&cgroup_mutex); + down_read(&css_set_rwsem); for_each_active_root(root) { struct cgroup_subsys *ss; @@ -4919,14 +4043,17 @@ int proc_cgroup_show(struct seq_file *m, void *v) root->name); seq_putc(m, ':'); cgrp = task_cgroup_from_root(tsk, root); - retval = cgroup_path(cgrp, buf, PAGE_SIZE); - if (retval < 0) + path = cgroup_path(cgrp, buf, PATH_MAX); + if (!path) { + retval = -ENAMETOOLONG; goto out_unlock; - seq_puts(m, buf); + } + seq_puts(m, path); seq_putc(m, '\n'); } out_unlock: + 
up_read(&css_set_rwsem); mutex_unlock(&cgroup_mutex); put_task_struct(tsk); out_free: @@ -4952,7 +4079,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, - ss->root->number_of_cgroups, !ss->disabled); + atomic_read(&ss->root->nr_cgrps), !ss->disabled); mutex_unlock(&cgroup_mutex); return 0; @@ -5022,12 +4149,12 @@ void cgroup_post_fork(struct task_struct *child) * lock on fork. */ if (use_task_css_set_links) { - write_lock(&css_set_lock); + down_write(&css_set_rwsem); task_lock(child); if (list_empty(&child->cg_list)) list_add(&child->cg_list, &task_css_set(child)->tasks); task_unlock(child); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } /* @@ -5036,15 +4163,7 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, and the builtin section of the subsys - * array is immutable, so we don't need to lock the - * subsys array here. On the other hand, modular section - * of the array can be freed at module unload, so we - * can't touch that. - */ - for_each_builtin_subsys(ss, i) + for_each_subsys(ss, i) if (ss->fork) ss->fork(child); } @@ -5092,15 +4211,14 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) int i; /* - * Unlink from the css_set task list if necessary. - * Optimistically check cg_list before taking - * css_set_lock + * Unlink from the css_set task list if necessary. Optimistically + * check cg_list before taking css_set_rwsem. */ if (!list_empty(&tsk->cg_list)) { - write_lock(&css_set_lock); + down_write(&css_set_rwsem); if (!list_empty(&tsk->cg_list)) list_del_init(&tsk->cg_list); - write_unlock(&css_set_lock); + up_write(&css_set_rwsem); } /* Reassign the task to the init_css_set. */ @@ -5109,11 +4227,8 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) RCU_INIT_POINTER(tsk->cgroups, &init_css_set); if (run_callbacks && need_forkexit_callback) { - /* - * fork/exit callbacks are supported only for builtin - * subsystems, see cgroup_post_fork() for details. 
- */ - for_each_builtin_subsys(ss, i) { + /* see cgroup_post_fork() for details */ + for_each_subsys(ss, i) { if (ss->exit) { struct cgroup_subsys_state *old_css = cset->subsys[i]; struct cgroup_subsys_state *css = task_css(tsk, i); @@ -5124,7 +4239,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } task_unlock(tsk); - put_css_set_taskexit(cset); + put_css_set(cset, true); } static void check_for_release(struct cgroup *cgrp) @@ -5181,16 +4296,17 @@ static void cgroup_release_agent(struct work_struct *work) while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; - char *pathbuf = NULL, *agentbuf = NULL; + char *pathbuf = NULL, *agentbuf = NULL, *path; struct cgroup *cgrp = list_entry(release_list.next, struct cgroup, release_list); list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); - pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) goto continue_free; - if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + path = cgroup_path(cgrp, pathbuf, PATH_MAX); + if (!path) goto continue_free; agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); if (!agentbuf) @@ -5198,7 +4314,7 @@ static void cgroup_release_agent(struct work_struct *work) i = 0; argv[i++] = agentbuf; - argv[i++] = pathbuf; + argv[i++] = path; argv[i] = NULL; i = 0; @@ -5232,11 +4348,7 @@ static int __init cgroup_disable(char *str) if (!*token) continue; - /* - * cgroup_disable, being at boot time, can't know about - * module subsystems, so we don't worry about them. - */ - for_each_builtin_subsys(ss, i) { + for_each_subsys(ss, i) { if (!strcmp(token, ss->name)) { ss->disabled = 1; printk(KERN_INFO "Disabling %s control group" @@ -5250,28 +4362,42 @@ static int __init cgroup_disable(char *str) __setup("cgroup_disable=", cgroup_disable); /** - * css_from_dir - get corresponding css from the dentry of a cgroup dir + * css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir * @dentry: directory dentry of interest * @ss: subsystem of interest * - * Must be called under cgroup_mutex or RCU read lock. The caller is - * responsible for pinning the returned css if it needs to be accessed - * outside the critical section. + * If @dentry is a directory for a cgroup which has @ss enabled on it, try + * to get the corresponding css and return it. If such css doesn't exist + * or can't be pinned, an ERR_PTR value is returned. */ -struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, - struct cgroup_subsys *ss) +struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { + struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; - cgroup_assert_mutex_or_rcu_locked(); - /* is @dentry a cgroup dir? */ - if (!dentry->d_inode || - dentry->d_inode->i_op != &cgroup_dir_inode_operations) + if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || + kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); - cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); + rcu_read_lock(); + + /* + * This path doesn't originate from kernfs and @kn could already + * have been or be removed at any point. @kn->priv is RCU + * protected for this access. See destroy_locked() for details. 
+ */ + cgrp = rcu_dereference(kn->priv); + if (cgrp) + css = cgroup_css(cgrp, ss); + + if (!css || !css_tryget(css)) + css = ERR_PTR(-ENOENT); + + rcu_read_unlock(); + return css; } /** @@ -5286,7 +4412,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) { struct cgroup *cgrp; - cgroup_assert_mutex_or_rcu_locked(); + cgroup_assert_mutexes_or_rcu_locked(); cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) @@ -5338,23 +4464,30 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) { struct cgrp_cset_link *link; struct css_set *cset; + char *name_buf; + + name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name_buf) + return -ENOMEM; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); rcu_read_lock(); cset = rcu_dereference(current->cgroups); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; - const char *name; + const char *name = "?"; + + if (c != cgroup_dummy_top) { + cgroup_name(c, name_buf, NAME_MAX + 1); + name = name_buf; + } - if (c->dentry) - name = c->dentry->d_name.name; - else - name = "?"; seq_printf(seq, "Root %d group %s\n", c->root->hierarchy_id, name); } rcu_read_unlock(); - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); + kfree(name_buf); return 0; } @@ -5364,7 +4497,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) struct cgroup_subsys_state *css = seq_css(seq); struct cgrp_cset_link *link; - read_lock(&css_set_lock); + down_read(&css_set_rwsem); list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; @@ -5380,7 +4513,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v) } } } - read_unlock(&css_set_lock); + up_read(&css_set_rwsem); return 0; } @@ -5423,11 +4556,9 @@ static struct cftype debug_files[] = { { } /* terminate */ }; -struct cgroup_subsys debug_subsys = { - .name = "debug", +struct cgroup_subsys debug_cgrp_subsys = { .css_alloc = debug_css_alloc, .css_free = debug_css_free, - .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; #endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 6c3154e477f..7201a637c40 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -52,7 +52,7 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) static inline struct freezer *task_freezer(struct task_struct *task) { - return css_freezer(task_css(task, freezer_subsys_id)); + return css_freezer(task_css(task, freezer_cgrp_id)); } static struct freezer *parent_freezer(struct freezer *freezer) @@ -84,8 +84,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -struct cgroup_subsys freezer_subsys; - static struct cgroup_subsys_state * freezer_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -189,7 +187,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. 
*/ - cgroup_taskset_for_each(task, new_css, tset) { + cgroup_taskset_for_each(task, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { @@ -473,13 +471,11 @@ static struct cftype files[] = { { } /* terminate */ }; -struct cgroup_subsys freezer_subsys = { - .name = "freezer", +struct cgroup_subsys freezer_cgrp_subsys = { .css_alloc = freezer_css_alloc, .css_online = freezer_css_online, .css_offline = freezer_css_offline, .css_free = freezer_css_free, - .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4410ac6a55f..d8bec21d7a1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,7 +119,7 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return css_cs(task_css(task, cpuset_subsys_id)); + return css_cs(task_css(task, cpuset_cgrp_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) @@ -467,7 +467,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * be changed to have empty cpus_allowed or mems_allowed. */ ret = -ENOSPC; - if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress)) { + if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) { if (!cpumask_empty(cur->cpus_allowed) && cpumask_empty(trial->cpus_allowed)) goto out; @@ -829,55 +829,36 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) } /** - * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's - * @tsk: task to test - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed - * mask needs to be changed. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_cpumask(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - - set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); -} - -/** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its cpus_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_cpumask(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); + css_task_iter_end(&it); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? 
- * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_cpumask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -898,7 +879,7 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_cpumask(cp, heap); + update_tasks_cpumask(cp); rcu_read_lock(); css_put(&cp->css); @@ -914,7 +895,6 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - struct ptr_heap heap; int retval; int is_load_balanced; @@ -947,19 +927,13 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) return retval; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; - is_load_balanced = is_sched_load_balance(trialcs); mutex_lock(&callback_mutex); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); mutex_unlock(&callback_mutex); - update_tasks_cpumask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_cpumask_hier(cs, true); if (is_load_balanced) rebuild_sched_domains_locked(); @@ -1052,53 +1026,22 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } -struct cpuset_change_nodemask_arg { - struct cpuset *cs; - nodemask_t *newmems; -}; - -/* - * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy - * of it to cpuset's new mems_allowed, and migrate pages to new nodes if - * memory_migrate flag is set. Called with cpuset_mutex held. - */ -static void cpuset_change_nodemask(struct task_struct *p, void *data) -{ - struct cpuset_change_nodemask_arg *arg = data; - struct cpuset *cs = arg->cs; - struct mm_struct *mm; - int migrate; - - cpuset_change_task_nodemask(p, arg->newmems); - - mm = get_task_mm(p); - if (!mm) - return; - - migrate = is_memory_migrate(cs); - - mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); - mmput(mm); -} - static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held. No return value. It's guaranteed that - * css_scan_tasks() always returns 0 if @heap != NULL. + * Iterate through each task of @cs updating its mems_allowed to the + * effective cpuset's. As this function is called with cpuset_mutex held, + * cpuset membership stays stable. */ -static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_nodemask(struct cpuset *cs) { static nodemask_t newmems; /* protected by cpuset_mutex */ struct cpuset *mems_cs = effective_nodemask_cpuset(cs); - struct cpuset_change_nodemask_arg arg = { .cs = cs, - .newmems = &newmems }; + struct css_task_iter it; + struct task_struct *task; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ @@ -1114,7 +1057,25 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. 
*/ - css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) { + struct mm_struct *mm; + bool migrate; + + cpuset_change_task_nodemask(task, &newmems); + + mm = get_task_mm(task); + if (!mm) + continue; + + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); + mmput(mm); + } + css_task_iter_end(&it); /* * All the tasks' nodemasks have been updated, update @@ -1130,15 +1091,13 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. * * Called with cpuset_mutex held */ -static void update_tasks_nodemask_hier(struct cpuset *root_cs, - bool update_root, struct ptr_heap *heap) +static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1159,7 +1118,7 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, continue; rcu_read_unlock(); - update_tasks_nodemask(cp, heap); + update_tasks_nodemask(cp); rcu_read_lock(); css_put(&cp->css); @@ -1184,7 +1143,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { int retval; - struct ptr_heap heap; /* * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; @@ -1223,17 +1181,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval < 0) - goto done; - mutex_lock(&callback_mutex); cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask_hier(cs, true, &heap); - - heap_free(&heap); + update_tasks_nodemask_hier(cs, true); done: return retval; } @@ -1261,38 +1213,22 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) } /** - * cpuset_change_flag - make a task's spread flags the same as its cpuset's - * @tsk: task to be updated - * @data: cpuset to @tsk belongs to - * - * Called by css_scan_tasks() for each task in a cgroup. - * - * We don't need to re-check for the cgroup/cpuset membership, since we're - * holding cpuset_mutex at this point. - */ -static void cpuset_change_flag(struct task_struct *tsk, void *data) -{ - struct cpuset *cs = data; - - cpuset_update_task_spread_flag(cs, tsk); -} - -/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to css_scan_tasks() - * - * Called with cpuset_mutex held - * - * The css_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. * - * No return value. It's guaranteed that css_scan_tasks() always returns 0 - * if @heap != NULL. + * Iterate through each task of @cs updating its spread flags. As this + * function is called with cpuset_mutex held, cpuset membership stays + * stable. 
*/ -static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +static void update_tasks_flags(struct cpuset *cs) { - css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&cs->css, &it); + while ((task = css_task_iter_next(&it))) + cpuset_update_task_spread_flag(cs, task); + css_task_iter_end(&it); } /* @@ -1310,7 +1246,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, struct cpuset *trialcs; int balance_flag_changed; int spread_flag_changed; - struct ptr_heap heap; int err; trialcs = alloc_trial_cpuset(cs); @@ -1326,10 +1261,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, if (err < 0) goto out; - err = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (err < 0) - goto out; - balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); @@ -1344,8 +1275,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, rebuild_sched_domains_locked(); if (spread_flag_changed) - update_tasks_flags(cs, &heap); - heap_free(&heap); + update_tasks_flags(cs); out: free_trial_cpuset(trialcs); return err; @@ -1449,6 +1379,8 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +static struct cpuset *cpuset_attach_old_cs; + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) @@ -1457,6 +1389,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, struct task_struct *task; int ret; + /* used later by cpuset_attach() */ + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + mutex_lock(&cpuset_mutex); /* @@ -1468,7 +1403,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1520,10 +1455,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, - cpuset_subsys_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = css_cs(oldcss); + struct cpuset *oldcs = cpuset_attach_old_cs; struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1537,7 +1470,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. 
TODO: have a better way to handle failure here @@ -2024,8 +1957,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) kfree(cs); } -struct cgroup_subsys cpuset_subsys = { - .name = "cpuset", +struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, .css_offline = cpuset_css_offline, @@ -2033,7 +1965,6 @@ struct cgroup_subsys cpuset_subsys = { .can_attach = cpuset_can_attach, .cancel_attach = cpuset_cancel_attach, .attach = cpuset_attach, - .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, }; @@ -2090,10 +2021,9 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) parent = parent_cs(parent); if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { - rcu_read_lock(); - printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", - cgroup_name(cs->css.cgroup)); - rcu_read_unlock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset "); + pr_cont_cgroup_name(cs->css.cgroup); + pr_cont("\n"); } } @@ -2141,7 +2071,7 @@ retry: */ if ((sane && cpumask_empty(cs->cpus_allowed)) || (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed))) - update_tasks_cpumask(cs, NULL); + update_tasks_cpumask(cs); mutex_lock(&callback_mutex); nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems); @@ -2155,7 +2085,7 @@ retry: */ if ((sane && nodes_empty(cs->mems_allowed)) || (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed))) - update_tasks_nodemask(cs, NULL); + update_tasks_nodemask(cs); is_empty = cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed); @@ -2217,7 +2147,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) mutex_lock(&callback_mutex); top_cpuset.mems_allowed = new_mems; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, NULL); + update_tasks_nodemask(&top_cpuset); } mutex_unlock(&cpuset_mutex); @@ -2621,19 +2551,17 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) /* Statically allocated to prevent using excess stack. 
*/ static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static DEFINE_SPINLOCK(cpuset_buffer_lock); - struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - rcu_read_lock(); spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); - printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + printk(KERN_INFO "%s cpuset=", tsk->comm); + pr_cont_cgroup_name(cgrp); + pr_cont(" mems_allowed=%s\n", cpuset_nodelist); spin_unlock(&cpuset_buffer_lock); - rcu_read_unlock(); } /* @@ -2683,12 +2611,12 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) { struct pid *pid; struct task_struct *tsk; - char *buf; + char *buf, *p; struct cgroup_subsys_state *css; int retval; retval = -ENOMEM; - buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + buf = kmalloc(PATH_MAX, GFP_KERNEL); if (!buf) goto out; @@ -2698,14 +2626,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) if (!tsk) goto out_free; + retval = -ENAMETOOLONG; rcu_read_lock(); - css = task_css(tsk, cpuset_subsys_id); - retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); + css = task_css(tsk, cpuset_cgrp_id); + p = cgroup_path(css->cgroup, buf, PATH_MAX); rcu_read_unlock(); - if (retval < 0) + if (!p) goto out_put_task; - seq_puts(m, buf); + seq_puts(m, p); seq_putc(m, '\n'); + retval = 0; out_put_task: put_task_struct(tsk); out_free: diff --git a/kernel/events/core.c b/kernel/events/core.c index 56003c6edfd..6dd714955b0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -342,7 +342,7 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_css(task, perf_subsys_id), + return container_of(task_css(task, perf_event_cgrp_id), struct perf_cgroup, css); } @@ -370,11 +370,6 @@ perf_cgroup_match(struct perf_event *event) event->cgrp->css.cgroup); } -static inline bool perf_tryget_cgroup(struct perf_event *event) -{ - return css_tryget(&event->cgrp->css); -} - static inline void perf_put_cgroup(struct perf_event *event) { css_put(&event->cgrp->css); @@ -593,9 +588,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; - rcu_read_lock(); - - css = css_from_dir(f.file->f_dentry, &perf_subsys); + css = css_tryget_from_dir(f.file->f_dentry, &perf_event_cgrp_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; @@ -604,13 +597,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, cgrp = container_of(css, struct perf_cgroup, css); event->cgrp = cgrp; - /* must be done before we fput() the file */ - if (!perf_tryget_cgroup(event)) { - event->cgrp = NULL; - ret = -ENOENT; - goto out; - } - /* * all events in a group must monitor * the same cgroup because a task belongs @@ -621,7 +607,6 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: - rcu_read_unlock(); fdput(f); return ret; } @@ -8036,7 +8021,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) task_function_call(task, __perf_cgroup_move, task); } @@ -8055,9 +8040,7 @@ static void perf_cgroup_exit(struct cgroup_subsys_state *css, task_function_call(task, __perf_cgroup_move, task); } -struct cgroup_subsys perf_subsys = { - .name = "perf_event", - .subsys_id = perf_subsys_id, +struct cgroup_subsys perf_event_cgrp_subsys = { .css_alloc = perf_cgroup_css_alloc, .css_free = 
perf_cgroup_css_free, .exit = perf_cgroup_exit, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b46131ef6aa..ba386a06ab1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7176,7 +7176,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgrp_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7600,7 +7600,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) { + cgroup_taskset_for_each(task, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7618,7 +7618,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css, tset) + cgroup_taskset_for_each(task, tset) sched_move_task(task); } @@ -7957,8 +7957,7 @@ static struct cftype cpu_files[] = { { } /* terminate */ }; -struct cgroup_subsys cpu_cgroup_subsys = { - .name = "cpu", +struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_free = cpu_cgroup_css_free, .css_online = cpu_cgroup_css_online, @@ -7966,7 +7965,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, - .subsys_id = cpu_cgroup_subsys_id, .base_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 622e0818f90..c143ee380e3 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -41,7 +41,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return css_ca(task_css(tsk, cpuacct_subsys_id)); + return css_ca(task_css(tsk, cpuacct_cgrp_id)); } static inline struct cpuacct *parent_ca(struct cpuacct *ca) @@ -275,11 +275,9 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", +struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, .base_cftypes = files, .early_init = 1, }; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index dd52e7ffb10..30eee3b5293 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -111,8 +111,7 @@ static char *task_group_path(struct task_group *tg) if (autogroup_path(tg, group_path, PATH_MAX)) return group_path; - cgroup_path(tg->css.cgroup, group_path, PATH_MAX); - return group_path; + return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); } #endif diff --git a/lib/kobject.c b/lib/kobject.c index cb14aeac4cc..58751bb80a7 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -94,7 +94,7 @@ static int create_dir(struct kobject *kobj) BUG_ON(ops->type >= KOBJ_NS_TYPES); BUG_ON(!kobj_ns_type_registered(ops->type)); - kernfs_enable_ns(kobj->sd); + sysfs_enable_ns(kobj->sd); } return 0; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index cb00829bb46..b135853e68f 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -30,7 +30,6 @@ struct hugetlb_cgroup { #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -struct cgroup_subsys 
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index cb00829bb46..b135853e68f 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -30,7 +30,6 @@ struct hugetlb_cgroup {
 #define MEMFILE_IDX(val)       (((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)      ((val) & 0xffff)
 
-struct cgroup_subsys hugetlb_subsys __read_mostly;
 static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
 
 static inline
@@ -42,7 +41,7 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
 static inline
 struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
 {
-       return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id));
+       return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id));
 }
 
 static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
@@ -358,7 +357,7 @@ static void __init __hugetlb_cgroup_file_init(int idx)
        cft = &h->cgroup_files[4];
        memset(cft, 0, sizeof(*cft));
 
-       WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
+       WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
 
        return;
 }
@@ -402,10 +401,8 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
        return;
 }
 
-struct cgroup_subsys hugetlb_subsys = {
-       .name = "hugetlb",
+struct cgroup_subsys hugetlb_cgrp_subsys = {
        .css_alloc = hugetlb_cgroup_css_alloc,
        .css_offline = hugetlb_cgroup_css_offline,
        .css_free = hugetlb_cgroup_css_free,
-       .subsys_id = hugetlb_subsys_id,
 };
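The rename pattern above (hugetlb_subsys/hugetlb_subsys_id becoming hugetlb_cgrp_subsys/hugetlb_cgrp_id, with .name and .subsys_id dropped) is the convention every controller in this series moves to. A sketch of how a controller is wired up afterwards, assuming its SUBSYS(foo) entry in include/linux/cgroup_subsys.h is what generates the foo_cgrp_id enum and the expected foo_cgrp_subsys name; "foo" is a made-up controller:

/* Illustrative only: a controller state object embedding a css. */
struct foo_cgroup {
        struct cgroup_subsys_state css;
        /* controller-specific fields ... */
};

/* .name, .subsys_id and .module are gone; the core derives them. */
struct cgroup_subsys foo_cgrp_subsys = {
        .css_alloc = foo_css_alloc,
        .css_free = foo_css_free,
        .base_cftypes = foo_files,
};

static struct foo_cgroup *task_foo(struct task_struct *task)
{
        return container_of(task_css(task, foo_cgrp_id),
                            struct foo_cgroup, css);
}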
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 53385cd4e6f..d9c6ac1532e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -66,8 +66,8 @@
 
 #include <trace/events/vmscan.h>
 
-struct cgroup_subsys mem_cgroup_subsys __read_mostly;
-EXPORT_SYMBOL(mem_cgroup_subsys);
+struct cgroup_subsys memory_cgrp_subsys __read_mostly;
+EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -538,7 +538,7 @@ static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 {
        struct cgroup_subsys_state *css;
 
-       css = css_from_id(id - 1, &mem_cgroup_subsys);
+       css = css_from_id(id - 1, &memory_cgrp_subsys);
        return mem_cgroup_from_css(css);
 }
 
@@ -1072,7 +1072,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
        if (unlikely(!p))
                return NULL;
 
-       return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id));
+       return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
 
 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -1683,15 +1683,8 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
-       /*
-        * protects memcg_name and makes sure that parallel ooms do not
-        * interleave
-        */
+       /* oom_info_lock ensures that parallel ooms do not interleave */
        static DEFINE_SPINLOCK(oom_info_lock);
-       struct cgroup *task_cgrp;
-       struct cgroup *mem_cgrp;
-       static char memcg_name[PATH_MAX];
-       int ret;
        struct mem_cgroup *iter;
        unsigned int i;
 
@@ -1701,36 +1694,14 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        spin_lock(&oom_info_lock);
        rcu_read_lock();
 
-       mem_cgrp = memcg->css.cgroup;
-       task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
-
-       ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
-       if (ret < 0) {
-               /*
-                * Unfortunately, we are unable to convert to a useful name
-                * But we'll still print out the usage information
-                */
-               rcu_read_unlock();
-               goto done;
-       }
-       rcu_read_unlock();
-
-       pr_info("Task in %s killed", memcg_name);
+       pr_info("Task in ");
+       pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
+       pr_info(" killed as a result of limit of ");
+       pr_cont_cgroup_path(memcg->css.cgroup);
+       pr_info("\n");
 
-       rcu_read_lock();
-       ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
-       if (ret < 0) {
-               rcu_read_unlock();
-               goto done;
-       }
        rcu_read_unlock();
 
-       /*
-        * Continues from above, so we don't need an KERN_ level
-        */
-       pr_cont(" as a result of limit of %s\n", memcg_name);
-done:
-
        pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n",
                res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
                res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
@@ -1745,13 +1716,8 @@ done:
                res_counter_read_u64(&memcg->kmem, RES_FAILCNT));
 
        for_each_mem_cgroup_tree(iter, memcg) {
-               pr_info("Memory cgroup stats");
-
-               rcu_read_lock();
-               ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX);
-               if (!ret)
-                       pr_cont(" for %s", memcg_name);
-               rcu_read_unlock();
+               pr_info("Memory cgroup stats for ");
+               pr_cont_cgroup_path(iter->css.cgroup);
                pr_cont(":");
 
                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3401,7 +3367,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
                                                  struct kmem_cache *s)
 {
        struct kmem_cache *new = NULL;
-       static char *tmp_name = NULL;
+       static char *tmp_path = NULL, *tmp_name = NULL;
        static DEFINE_MUTEX(mutex);     /* protects tmp_name */
 
        BUG_ON(!memcg_can_account_kmem(memcg));
@@ -3413,18 +3379,20 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
         * This static temporary buffer is used to prevent from
         * pointless shortliving allocation.
         */
-       if (!tmp_name) {
-               tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
+       if (!tmp_path || !tmp_name) {
+               if (!tmp_path)
+                       tmp_path = kmalloc(PATH_MAX, GFP_KERNEL);
                if (!tmp_name)
+                       tmp_name = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+               if (!tmp_path || !tmp_name)
                        goto out;
        }
 
-       rcu_read_lock();
-       snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
-                memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
-       rcu_read_unlock();
+       cgroup_name(memcg->css.cgroup, tmp_name, NAME_MAX + 1);
+       snprintf(tmp_path, PATH_MAX, "%s(%d:%s)", s->name,
+                memcg_cache_id(memcg), tmp_name);
 
-       new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
+       new = kmem_cache_create_memcg(memcg, tmp_path, s->object_size, s->align,
                                      (s->flags & ~SLAB_PANIC), s->ctor, s);
 
        if (new)
                new->allocflags |= __GFP_KMEMCG;
@@ -4990,7 +4958,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
        struct cgroup *cgrp = memcg->css.cgroup;
 
        /* returns EBUSY if there is a task or if we come here twice. */
-       if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+       if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children))
                return -EBUSY;
 
        /* we call try-to-free pages for make this cgroup empty */
@@ -5172,7 +5140,7 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
-       if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+       if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
        if (err)
@@ -6183,17 +6151,15 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,
         * automatically removed on cgroup destruction but the removal is
         * asynchronous, so take an extra ref on @css.
         */
-       rcu_read_lock();
-
+       cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent,
+                                       &memory_cgrp_subsys);
        ret = -EINVAL;
-       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
-                                &mem_cgroup_subsys);
-       if (cfile_css == css && css_tryget(css))
-               ret = 0;
-
-       rcu_read_unlock();
-       if (ret)
+       if (IS_ERR(cfile_css))
+               goto out_put_cfile;
+       if (cfile_css != css) {
+               css_put(cfile_css);
                goto out_put_cfile;
+       }
 
        ret = event->register_event(memcg, event->eventfd, buffer);
        if (ret)
@@ -6566,11 +6532,11 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
                 * unfortunate state in our controller.
                 */
                if (parent != root_mem_cgroup)
-                       mem_cgroup_subsys.broken_hierarchy = true;
+                       memory_cgrp_subsys.broken_hierarchy = true;
        }
        mutex_unlock(&memcg_create_mutex);
 
-       return memcg_init_kmem(memcg, &mem_cgroup_subsys);
+       return memcg_init_kmem(memcg, &memory_cgrp_subsys);
 }
 
 /*
@@ -7264,9 +7230,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
                mem_cgroup_from_css(root_css)->use_hierarchy = true;
 }
 
-struct cgroup_subsys mem_cgroup_subsys = {
-       .name = "memory",
-       .subsys_id = mem_cgroup_subsys_id,
+struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc = mem_cgroup_css_alloc,
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
@@ -7292,7 +7256,7 @@ __setup("swapaccount=", enable_swap_account);
 
 static void __init memsw_file_init(void)
 {
-       WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, memsw_cgroup_files));
+       WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
 }
 
 static void __init enable_swap_cgroup(void)
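Both perf_cgroup_connect() earlier and memcg_write_event_control() above drop the open-coded rcu_read_lock()/css_from_dir()/css_tryget() sequence in favour of css_tryget_from_dir(), which looks up the css for a cgroupfs directory and returns it with a reference already held, or an ERR_PTR. A sketch of the resulting pattern, not taken from the patch; the helper name and the dentry parameter are illustrative:

/* Sketch only: look up and pin the memory controller css for a cgroup dir. */
static int pin_memcg_css(struct dentry *dentry,
                         struct cgroup_subsys_state **cssp)
{
        struct cgroup_subsys_state *css;

        css = css_tryget_from_dir(dentry, &memory_cgrp_subsys);
        if (IS_ERR(css))
                return PTR_ERR(css);    /* no RCU section, no separate tryget */

        *cssp = css;                    /* caller drops it with css_put() */
        return 0;
}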
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 4f08a2d6148..9b5933c66c1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -145,14 +145,10 @@ static int hwpoison_filter_task(struct page *p)
                return -EINVAL;
 
        css = mem_cgroup_css(mem);
-       /* root_mem_cgroup has NULL dentries */
-       if (!css->cgroup->dentry)
-               return -EINVAL;
-
-       ino = css->cgroup->dentry->d_inode->i_ino;
+       ino = cgroup_ino(css->cgroup);
        css_put(css);
 
-       if (ino != hwpoison_filter_memcg)
+       if (!ino || ino != hwpoison_filter_memcg)
                return -EINVAL;
 
        return 0;
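The hwpoison filter above no longer dereferences cgroup->dentry; cgroup_ino() hides the kernfs details and returns 0 when the cgroup directory has no inode, which the caller must treat as "matches nothing". A small sketch of that check, not part of the patch; the helper name is made up:

/* Sketch only: compare a cgroup against a user-supplied inode number. */
static bool cgroup_matches_ino(struct cgroup *cgrp, unsigned long wanted)
{
        unsigned long ino = cgroup_ino(cgrp);

        /* 0 means no backing inode (yet); never treat it as a match */
        return ino && ino == wanted;
}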
diff --git a/net/Kconfig b/net/Kconfig
index e411046a62e..a83bc4cc45a 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -239,7 +239,7 @@ config XPS
        default y
 
 config CGROUP_NET_PRIO
-       tristate "Network priority cgroup"
+       bool "Network priority cgroup"
        depends on CGROUPS
        ---help---
          Cgroup subsystem for use in assigning processes to network priorities on
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 719efd54166..22931e1b99b 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -23,7 +23,7 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
 
 struct cgroup_cls_state *task_cls_state(struct task_struct *p)
 {
-       return css_cls_state(task_css(p, net_cls_subsys_id));
+       return css_cls_state(task_css(p, net_cls_cgrp_id));
 }
 EXPORT_SYMBOL_GPL(task_cls_state);
 
@@ -73,7 +73,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css,
        void *v = (void *)(unsigned long)cs->classid;
        struct task_struct *p;
 
-       cgroup_taskset_for_each(p, css, tset) {
+       cgroup_taskset_for_each(p, tset) {
                task_lock(p);
                iterate_fd(p->files, 0, update_classid, v);
                task_unlock(p);
@@ -102,19 +102,10 @@ static struct cftype ss_files[] = {
        { }     /* terminate */
 };
 
-struct cgroup_subsys net_cls_subsys = {
-       .name = "net_cls",
+struct cgroup_subsys net_cls_cgrp_subsys = {
        .css_alloc = cgrp_css_alloc,
        .css_online = cgrp_css_online,
        .css_free = cgrp_css_free,
        .attach = cgrp_attach,
-       .subsys_id = net_cls_subsys_id,
        .base_cftypes = ss_files,
-       .module = THIS_MODULE,
 };
-
-static int __init init_netclassid_cgroup(void)
-{
-       return cgroup_load_subsys(&net_cls_subsys);
-}
-__initcall(init_netclassid_cgroup);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index 9043caedcd0..f9f3a40d335 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css,
        struct task_struct *p;
        void *v = (void *)(unsigned long)css->cgroup->id;
 
-       cgroup_taskset_for_each(p, css, tset) {
+       cgroup_taskset_for_each(p, tset) {
                task_lock(p);
                iterate_fd(p->files, 0, update_netprio, v);
                task_unlock(p);
@@ -244,15 +244,12 @@ static struct cftype ss_files[] = {
        { }     /* terminate */
 };
 
-struct cgroup_subsys net_prio_subsys = {
-       .name = "net_prio",
+struct cgroup_subsys net_prio_cgrp_subsys = {
        .css_alloc = cgrp_css_alloc,
        .css_online = cgrp_css_online,
        .css_free = cgrp_css_free,
        .attach = net_prio_attach,
-       .subsys_id = net_prio_subsys_id,
        .base_cftypes = ss_files,
-       .module = THIS_MODULE,
 };
 
 static int netprio_device_event(struct notifier_block *unused,
@@ -283,37 +280,9 @@ static struct notifier_block netprio_device_notifier = {
 
 static int __init init_cgroup_netprio(void)
 {
-       int ret;
-
-       ret = cgroup_load_subsys(&net_prio_subsys);
-       if (ret)
-               goto out;
        register_netdevice_notifier(&netprio_device_notifier);
-
-out:
-       return ret;
-}
-
-static void __exit exit_cgroup_netprio(void)
-{
-       struct netprio_map *old;
-       struct net_device *dev;
-
-       unregister_netdevice_notifier(&netprio_device_notifier);
-
-       cgroup_unload_subsys(&net_prio_subsys);
-
-       rtnl_lock();
-       for_each_netdev(&init_net, dev) {
-               old = rtnl_dereference(dev->priomap);
-               RCU_INIT_POINTER(dev->priomap, NULL);
-               if (old)
-                       kfree_rcu(old, rcu);
-       }
-       rtnl_unlock();
+       return 0;
 }
 
-module_init(init_cgroup_netprio);
-module_exit(exit_cgroup_netprio);
+subsys_initcall(init_cgroup_netprio);
 MODULE_LICENSE("GPL v2");
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index f7e522c558b..20a0aca9131 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -219,7 +219,7 @@ static struct cftype tcp_files[] = {
 
 static int __init tcp_memcontrol_init(void)
 {
-       WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, tcp_files));
+       WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
        return 0;
 }
 __initcall(tcp_memcontrol_init);
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d3b6d2cd3a0..7f88bcde7c6 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -58,11 +58,9 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s)
 
 static inline struct dev_cgroup *task_devcgroup(struct task_struct *task)
 {
-       return css_to_devcgroup(task_css(task, devices_subsys_id));
+       return css_to_devcgroup(task_css(task, devices_cgrp_id));
 }
 
-struct cgroup_subsys devices_subsys;
-
 /*
  * called under devcgroup_mutex
  */
@@ -684,13 +682,11 @@ static struct cftype dev_cgroup_files[] = {
        { }     /* terminate */
 };
 
-struct cgroup_subsys devices_subsys = {
-       .name = "devices",
+struct cgroup_subsys devices_cgrp_subsys = {
        .css_alloc = devcgroup_css_alloc,
        .css_free = devcgroup_css_free,
        .css_online = devcgroup_online,
        .css_offline = devcgroup_offline,
-       .subsys_id = devices_subsys_id,
        .base_cftypes = dev_cgroup_files,
 };
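With module support gone (CGROUP_NET_PRIO becomes bool, .module/.subsys_id fields and cgroup_load_subsys()/cgroup_unload_subsys() disappear, and the netprio exit path is deleted), a controller's remaining init work is limited to registering optional extra control files from a boot-time initcall, as memsw_file_init() and tcp_memcontrol_init() above do. A sketch of that shape, with a made-up cftype array; this is illustrative, not part of the patch:

/* Illustrative only: foo_extra_files is a hypothetical cftype array. */
static struct cftype foo_extra_files[] = {
        /* ... optional entries ... */
        { }     /* terminate */
};

static int __init foo_extra_init(void)
{
        WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, foo_extra_files));
        return 0;
}
__initcall(foo_extra_init);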