diff options
Diffstat (limited to 'Documentation/cgroups')
-rw-r--r-- | Documentation/cgroups/cgroup_event_listener.c | 110 | ||||
-rw-r--r-- | Documentation/cgroups/cgroups.txt | 39 | ||||
-rw-r--r-- | Documentation/cgroups/memcg_test.txt | 47 | ||||
-rw-r--r-- | Documentation/cgroups/memory.txt | 80 |
4 files changed, 268 insertions, 8 deletions
diff --git a/Documentation/cgroups/cgroup_event_listener.c b/Documentation/cgroups/cgroup_event_listener.c new file mode 100644 index 00000000000..8c2bfc4a635 --- /dev/null +++ b/Documentation/cgroups/cgroup_event_listener.c @@ -0,0 +1,110 @@ +/* + * cgroup_event_listener.c - Simple listener of cgroup events + * + * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name> + */ + +#include <assert.h> +#include <errno.h> +#include <fcntl.h> +#include <libgen.h> +#include <limits.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> + +#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>\n" + +int main(int argc, char **argv) +{ + int efd = -1; + int cfd = -1; + int event_control = -1; + char event_control_path[PATH_MAX]; + char line[LINE_MAX]; + int ret; + + if (argc != 3) { + fputs(USAGE_STR, stderr); + return 1; + } + + cfd = open(argv[1], O_RDONLY); + if (cfd == -1) { + fprintf(stderr, "Cannot open %s: %s\n", argv[1], + strerror(errno)); + goto out; + } + + ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control", + dirname(argv[1])); + if (ret >= PATH_MAX) { + fputs("Path to cgroup.event_control is too long\n", stderr); + goto out; + } + + event_control = open(event_control_path, O_WRONLY); + if (event_control == -1) { + fprintf(stderr, "Cannot open %s: %s\n", event_control_path, + strerror(errno)); + goto out; + } + + efd = eventfd(0, 0); + if (efd == -1) { + perror("eventfd() failed"); + goto out; + } + + ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]); + if (ret >= LINE_MAX) { + fputs("Arguments string is too long\n", stderr); + goto out; + } + + ret = write(event_control, line, strlen(line) + 1); + if (ret == -1) { + perror("Cannot write to cgroup.event_control"); + goto out; + } + + while (1) { + uint64_t result; + + ret = read(efd, &result, sizeof(result)); + if (ret == -1) { + if (errno == EINTR) + continue; + perror("Cannot read from eventfd"); + break; + } + assert(ret == sizeof(result)); + + ret = access(event_control_path, W_OK); + if ((ret == -1) && (errno == ENOENT)) { + puts("The cgroup seems to have removed."); + ret = 0; + break; + } + + if (ret == -1) { + perror("cgroup.event_control " + "is not accessable any more"); + break; + } + + printf("%s %s: crossed\n", argv[1], argv[2]); + } + +out: + if (efd >= 0) + close(efd); + if (event_control >= 0) + close(event_control); + if (cfd >= 0) + close(cfd); + + return (ret != 0); +} diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 0b33bfe7dde..fd588ff0e29 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -22,6 +22,8 @@ CONTENTS: 2. Usage Examples and Syntax 2.1 Basic Usage 2.2 Attaching processes + 2.3 Mounting hierarchies by name + 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -434,6 +436,25 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc/<pid>/cgroups. +2.4 Notification API +-------------------- + +There is mechanism which allows to get notifications about changing +status of a cgroup. + +To register new notification handler you need: + - create a file descriptor for event notification using eventfd(2); + - open a control file to be monitored (e.g. memory.usage_in_bytes); + - write "<event_fd> <control_fd> <args>" to cgroup.event_control. + Interpretation of args is defined by control file implementation; + +eventfd will be woken up by control file implementation or when the +cgroup is removed. + +To unregister notification handler just close eventfd. + +NOTE: Support of notifications should be implemented for the control +file. See documentation for the subsystem. 3. Kernel API ============= @@ -488,6 +509,11 @@ Each subsystem should: - add an entry in linux/cgroup_subsys.h - define a cgroup_subsys object called <name>_subsys +If a subsystem can be compiled as a module, it should also have in its +module initcall a call to cgroup_load_subsys(), and in its exitcall a +call to cgroup_unload_subsys(). It should also set its_subsys.module = +THIS_MODULE in its .c file. + Each subsystem may export the following methods. The only mandatory methods are create/destroy. Any others that are null are presumed to be successful no-ops. @@ -536,10 +562,21 @@ returns an error, this will abort the attach operation. If a NULL task is passed, then a successful result indicates that *any* unspecified task can be moved into the cgroup. Note that this isn't called on a fork. If this method returns 0 (success) then this should -remain valid while the caller holds cgroup_mutex. If threadgroup is +remain valid while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. If threadgroup is true, then a successful result indicates that all threads in the given thread's threadgroup can be moved together. +void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *task, bool threadgroup) +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsytem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. + void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task, bool threadgroup) diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 72db89ed060..f7f68b2ac19 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -1,6 +1,6 @@ Memory Resource Controller(Memcg) Implementation Memo. -Last Updated: 2009/1/20 -Base Kernel Version: based on 2.6.29-rc2. +Last Updated: 2010/2 +Base Kernel Version: based on 2.6.33-rc7-mm(candidate for 34). Because VM is getting complex (one of reasons is memcg...), memcg's behavior is complex. This is a document for memcg's internal behavior. @@ -337,7 +337,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. race and lock dependency with other cgroup subsystems. example) - # mount -t cgroup none /cgroup -t cpuset,memory,cpu,devices + # mount -t cgroup none /cgroup -o cpuset,memory,cpu,devices and do task move, mkdir, rmdir etc...under this. @@ -348,7 +348,7 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. For example, test like following is good. (Shell-A) - # mount -t cgroup none /cgroup -t memory + # mount -t cgroup none /cgroup -o memory # mkdir /cgroup/test # echo 40M > /cgroup/test/memory.limit_in_bytes # echo 0 > /cgroup/test/tasks @@ -378,3 +378,42 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. #echo 50M > memory.limit_in_bytes #echo 50M > memory.memsw.limit_in_bytes run 51M of malloc + + 9.9 Move charges at task migration + Charges associated with a task can be moved along with task migration. + + (Shell-A) + #mkdir /cgroup/A + #echo $$ >/cgroup/A/tasks + run some programs which uses some amount of memory in /cgroup/A. + + (Shell-B) + #mkdir /cgroup/B + #echo 1 >/cgroup/B/memory.move_charge_at_immigrate + #echo "pid of the program running in group A" >/cgroup/B/tasks + + You can see charges have been moved by reading *.usage_in_bytes or + memory.stat of both A and B. + See 8.2 of Documentation/cgroups/memory.txt to see what value should be + written to move_charge_at_immigrate. + + 9.10 Memory thresholds + Memory controler implements memory thresholds using cgroups notification + API. You can use Documentation/cgroups/cgroup_event_listener.c to test + it. + + (Shell-A) Create cgroup and run event listener + # mkdir /cgroup/A + # ./cgroup_event_listener /cgroup/A/memory.usage_in_bytes 5M + + (Shell-B) Add task to cgroup and try to allocate and free memory + # echo $$ >/cgroup/A/tasks + # a="$(dd if=/dev/zero bs=1M count=10)" + # a= + + You will see message from cgroup_event_listener every time you cross + the thresholds. + + Use /cgroup/A/memory.memsw.usage_in_bytes to test memsw thresholds. + + It's good idea to test root cgroup as well. diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index b871f2552b4..f8bc802d70b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -182,6 +182,8 @@ list. NOTE: Reclaim does not work for the root cgroup, since we cannot set any limits on the root cgroup. +Note2: When panic_on_oom is set to "2", the whole system will panic. + 2. Locking The memory controller uses the following hierarchy @@ -262,10 +264,12 @@ some of the pages cached in the cgroup (page cache pages). 4.2 Task migration When a task migrates from one cgroup to another, it's charge is not -carried forward. The pages allocated from the original cgroup still +carried forward by default. The pages allocated from the original cgroup still remain charged to it, the charge is dropped when the page is freed or reclaimed. +Note: You can move charges of a task along with task migration. See 8. + 4.3 Removing a cgroup A cgroup can be removed by rmdir, but as discussed in sections 4.1 and 4.2, a @@ -377,7 +381,8 @@ The feature can be disabled by NOTE1: Enabling/disabling will fail if the cgroup already has other cgroups created below it. -NOTE2: This feature can be enabled/disabled per subtree. +NOTE2: When panic_on_oom is set to "2", the whole system will panic in +case of an oom event in any cgroup. 7. Soft limits @@ -414,7 +419,76 @@ NOTE1: Soft limits take effect over a long period of time, since they involve NOTE2: It is recommended to set the soft limit always below the hard limit, otherwise the hard limit will take precedence. -8. TODO +8. Move charges at task migration + +Users can move charges associated with a task along with task migration, that +is, uncharge task's pages from the old cgroup and charge them to the new cgroup. +This feature is not supported in !CONFIG_MMU environments because of lack of +page tables. + +8.1 Interface + +This feature is disabled by default. It can be enabled(and disabled again) by +writing to memory.move_charge_at_immigrate of the destination cgroup. + +If you want to enable it: + +# echo (some positive value) > memory.move_charge_at_immigrate + +Note: Each bits of move_charge_at_immigrate has its own meaning about what type + of charges should be moved. See 8.2 for details. +Note: Charges are moved only when you move mm->owner, IOW, a leader of a thread + group. +Note: If we cannot find enough space for the task in the destination cgroup, we + try to make space by reclaiming memory. Task migration may fail if we + cannot make enough space. +Note: It can take several seconds if you move charges in giga bytes order. + +And if you want disable it again: + +# echo 0 > memory.move_charge_at_immigrate + +8.2 Type of charges which can be move + +Each bits of move_charge_at_immigrate has its own meaning about what type of +charges should be moved. + + bit | what type of charges would be moved ? + -----+------------------------------------------------------------------------ + 0 | A charge of an anonymous page(or swap of it) used by the target task. + | Those pages and swaps must be used only by the target task. You must + | enable Swap Extension(see 2.4) to enable move of swap charges. + +Note: Those pages and swaps must be charged to the old cgroup. +Note: More type of pages(e.g. file cache, shmem,) will be supported by other + bits in future. + +8.3 TODO + +- Add support for other types of pages(e.g. file cache, shmem, etc.). +- Implement madvise(2) to let users decide the vma to be moved or not to be + moved. +- All of moving charge operations are done under cgroup_mutex. It's not good + behavior to hold the mutex too long, so we may need some trick. + +9. Memory thresholds + +Memory controler implements memory thresholds using cgroups notification +API (see cgroups.txt). It allows to register multiple memory and memsw +thresholds and gets notifications when it crosses. + +To register a threshold application need: + - create an eventfd using eventfd(2); + - open memory.usage_in_bytes or memory.memsw.usage_in_bytes; + - write string like "<event_fd> <memory.usage_in_bytes> <threshold>" to + cgroup.event_control. + +Application will be notified through eventfd when memory usage crosses +threshold in any direction. + +It's applicable for root and non-root cgroup. + +10. TODO 1. Add support for accounting huge pages (as a separate controller) 2. Make per-cgroup scanner reclaim not-shared pages first |