From 0e241ffd306c0896bb9959be7faa4d4cfcb706d9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 24 Jul 2008 16:58:42 -0700 Subject: locking: fix mutex @key parameter kernel-doc notation Fix @key parameter to mutex_init() and one of its callers. Warning(linux-2.6.26-git11//drivers/base/class.c:210): No description found for parameter 'key' Signed-off-by: Randy Dunlap Acked-by: Greg Kroah-Hartman Signed-off-by: Ingo Molnar --- kernel/mutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index bcdc9ac8ef6..12c779dc65d 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -34,6 +34,7 @@ /*** * mutex_init - initialize the mutex * @lock: the mutex to be initialized + * @key: the lock_class_key for the class; used by mutex lock debugging * * Initialize the mutex to unlocked state. * -- cgit v1.2.3-70-g09d2 From 1a4e564b7db999fbe5d88318c96ac8747699d417 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 29 Jul 2008 22:32:57 -0700 Subject: resource: add resource_size() Avoid one-off errors by introducing a resource_size() function. Signed-off-by: Magnus Damm Cc: Ben Dooks Cc: Jean Delvare Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++++ kernel/resource.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 2cd07cc2968..22d2115458c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -118,6 +118,10 @@ extern int allocate_resource(struct resource *root, struct resource *new, int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); resource_size_t resource_alignment(struct resource *res); +static inline resource_size_t resource_size(struct resource *res) +{ + return res->end - res->start + 1; +} /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name)) diff --git a/kernel/resource.c b/kernel/resource.c index 74af2d7cb5a..f5b518eabef 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -490,7 +490,7 @@ resource_size_t resource_alignment(struct resource *res) { switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { case IORESOURCE_SIZEALIGN: - return res->end - res->start + 1; + return resource_size(res); case IORESOURCE_STARTALIGN: return res->start; default: -- cgit v1.2.3-70-g09d2 From 5a3eb9f6b7c598529f832b8baa6458ab1cbab2c6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:18 -0700 Subject: cgroup: fix possible memory leak There's a leak if copy_from_user() returns failure. Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 657f8f8d93a..28debe4e148 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1424,14 +1424,17 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, if (buffer == NULL) return -ENOMEM; } - if (nbytes && copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } buffer[nbytes] = 0; /* nul-terminate */ strstrip(buffer); retval = cft->write_string(cgrp, cft, buffer); if (!retval) retval = nbytes; +out: if (buffer != local_buffer) kfree(buffer); return retval; -- cgit v1.2.3-70-g09d2 From 36553434f475a84b653e25e74490ee8df43b86d5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:19 -0700 Subject: cgroup: remove duplicate code in allocate_cg_link() - just call free_cg_links() in allocate_cg_links() - the list will get initialized in allocate_cg_links(), so don't init it twice Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 28debe4e148..249a517591b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set( return NULL; } +static void free_cg_links(struct list_head *tmp) +{ + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { + list_del(&link->cgrp_link_list); + kfree(link); + } +} + /* * allocate_cg_links() allocates "count" cg_cgroup_link structures * and chains them on tmp through their cgrp_link_list fields. Returns 0 on @@ -363,17 +374,12 @@ static struct css_set *find_existing_css_set( static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; int i; INIT_LIST_HEAD(tmp); for (i = 0; i < count; i++) { link = kmalloc(sizeof(*link), GFP_KERNEL); if (!link) { - list_for_each_entry_safe(link, saved_link, tmp, - cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } + free_cg_links(tmp); return -ENOMEM; } list_add(&link->cgrp_link_list, tmp); @@ -381,17 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp) return 0; } -static void free_cg_links(struct list_head *tmp) -{ - struct cg_cgroup_link *link; - struct cg_cgroup_link *saved_link; - - list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { - list_del(&link->cgrp_link_list); - kfree(link); - } -} - /* * find_css_set() takes an existing cgroup group and a * cgroup object, and returns a css_set object that's @@ -956,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct super_block *sb; struct cgroupfs_root *root; struct list_head tmp_cg_links; - INIT_LIST_HEAD(&tmp_cg_links); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); -- cgit v1.2.3-70-g09d2 From 55b6fd0162ace1e0f1b52c8c092565c115127ef6 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:20 -0700 Subject: cgroup: uninline cgroup_has_css_refs() It's not small enough, and has 2 call sites. text data bss dec hex filename 12813 1676 4832 19321 4b79 cgroup.o.orig 12775 1676 4832 19283 4b53 cgroup.o Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 249a517591b..13932abde15 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2368,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static inline int cgroup_has_css_refs(struct cgroup *cgrp) +static int cgroup_has_css_refs(struct cgroup *cgrp) { /* Check the reference count on each subsystem. Since we * already established that there are no tasks in the -- cgit v1.2.3-70-g09d2 From 8d1e6266f512b3a94ef6d33528ff385f1aea0392 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:21 -0700 Subject: cpuset: a bit cleanup for scan_for_empty_cpusets() clean up hierarchy traversal code Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Jackson Cc: Cliff Wickman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 91cf85b36dd..3624dc0e95e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1833,24 +1833,21 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) */ static void scan_for_empty_cpusets(const struct cpuset *root) { + LIST_HEAD(queue); struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ - struct list_head queue; struct cgroup *cont; nodemask_t oldmems; - INIT_LIST_HEAD(&queue); - list_add_tail((struct list_head *)&root->stack_list, &queue); while (!list_empty(&queue)) { - cp = container_of(queue.next, struct cpuset, stack_list); + cp = list_first_entry(&queue, struct cpuset, stack_list); list_del(queue.next); list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); list_add_tail(&child->stack_list, &queue); } - cont = cp->css.cgroup; /* Continue past cpusets with all cpus, mems online */ if (cpus_subset(cp->cpus_allowed, cpu_online_map) && -- cgit v1.2.3-70-g09d2 From f5393693e96393131a4a2e2743f883986d508503 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 29 Jul 2008 22:33:22 -0700 Subject: cpuset: speed up sched domain partition All child cpusets contain a subset of the parent's cpus, so we can skip them when partitioning sched domains. This decreases 'csa' greately for cpusets with multi-level hierarchy. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3624dc0e95e..c818c36b0c5 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -486,13 +486,38 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) static void update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) { - if (!dattr) - return; if (dattr->relax_domain_level < c->relax_domain_level) dattr->relax_domain_level = c->relax_domain_level; return; } +static void +update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +{ + LIST_HEAD(q); + + list_add(&c->stack_list, &q); + while (!list_empty(&q)) { + struct cpuset *cp; + struct cgroup *cont; + struct cpuset *child; + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpus_empty(cp->cpus_allowed)) + continue; + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } +} + /* * rebuild_sched_domains() * @@ -614,8 +639,16 @@ void rebuild_sched_domains(void) if (cpus_empty(cp->cpus_allowed)) continue; - if (is_sched_load_balance(cp)) + /* + * All child cpusets contain a subset of the parent's cpus, so + * just skip them, and then we call update_domain_attr_tree() + * to calc relax_domain_level of the corresponding sched + * domain. + */ + if (is_sched_load_balance(cp)) { csa[csn++] = cp; + continue; + } list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); @@ -686,7 +719,7 @@ restart: cpus_or(*dp, *dp, b->cpus_allowed); b->pn = -1; if (dattr) - update_domain_attr(dattr + update_domain_attr_tree(dattr + nslot, b); } } -- cgit v1.2.3-70-g09d2 From 93a6557558a13f9ff35213efeca483f353c39dd3 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:23 -0700 Subject: cpuset: fix wrong calculation of relax domain level When multiple cpusets are overlapping in their 'cpus' and hence they form a single sched domain, the largest sched_relax_domain_level among those should be used. But when top_cpuset's sched_load_balance is set, its sched_relax_domain_level is used regardless other sub-cpusets'. This patch fixes it by walking the cpuset hierarchy to find the largest sched_relax_domain_level. Signed-off-by: Lai Jiangshan Signed-off-by: Li Zefan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Reviewed-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c818c36b0c5..7a82e9033a7 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -616,7 +616,7 @@ void rebuild_sched_domains(void) dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; - update_domain_attr(dattr, &top_cpuset); + update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; goto rebuild; -- cgit v1.2.3-70-g09d2 From aeed682421a5ebfbf46940e30c3d1caf3bc64304 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 29 Jul 2008 22:33:24 -0700 Subject: cpuset: clean up cpuset hierarchy traversal code Use cpuset.stack_list rather than kfifo, so we avoid memory allocation for kfifo. Signed-off-by: Li Zefan Signed-off-by: Lai Jiangshan Cc: Paul Menage Cc: Cedric Le Goater Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7a82e9033a7..d5ab79cf516 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -557,7 +556,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * So the reverse nesting would risk an ABBA deadlock. * * The three key local variables below are: - * q - a kfifo queue of cpuset pointers, used to implement a + * q - a linked-list queue of cpuset pointers, used to implement a * top-down scan of all cpusets. This scan loads a pointer * to each cpuset marked is_sched_load_balance into the * array 'csa'. For our purposes, rebuilding the schedulers @@ -592,7 +591,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) void rebuild_sched_domains(void) { - struct kfifo *q; /* queue of cpusets to be scanned */ + LIST_HEAD(q); /* queue of cpusets to be scanned*/ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -602,7 +601,6 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - q = NULL; csa = NULL; doms = NULL; dattr = NULL; @@ -622,20 +620,19 @@ void rebuild_sched_domains(void) goto rebuild; } - q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); - if (IS_ERR(q)) - goto done; csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; - cp = &top_cpuset; - __kfifo_put(q, (void *)&cp, sizeof(cp)); - while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { + list_add(&top_cpuset.stack_list, &q); + while (!list_empty(&q)) { struct cgroup *cont; struct cpuset *child; /* scans child cpusets of cp */ + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + if (cpus_empty(cp->cpus_allowed)) continue; @@ -652,7 +649,7 @@ void rebuild_sched_domains(void) list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { child = cgroup_cs(cont); - __kfifo_put(q, (void *)&child, sizeof(cp)); + list_add_tail(&child->stack_list, &q); } } @@ -735,8 +732,6 @@ rebuild: put_online_cpus(); done: - if (q && !IS_ERR(q)) - kfifo_free(q); kfree(csa); /* Don't kfree(doms) -- partition_sched_domains() does that. */ /* Don't kfree(dattr) -- partition_sched_domains() does that. */ -- cgit v1.2.3-70-g09d2 From 5def9a3a22e09c99717f41ab7f07ec9e1a1f3ec8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 29 Jul 2008 22:33:31 -0700 Subject: markers: fix markers read barrier for multiple probes Paul pointed out two incorrect read barriers in the marker handler code in the path where multiple probes are connected. Those are ordering reads of "ptype" (single or multi probe marker), "multi" array pointer, and "multi" array data access. It should be ordered like this : read ptype smp_rmb() read multi array pointer smp_read_barrier_depends() access data referenced by multi array pointer The code with a single probe connected (optimized case, does not have to allocate an array) has correct memory ordering. It applies to kernel 2.6.26.x, 2.6.25.x and linux-next. Signed-off-by: Mathieu Desnoyers Cc: "Paul E. McKenney" Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 971da531790..7d1faecd7a5 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -125,6 +125,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) } else { struct marker_probe_closure *multi; int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, @@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) { va_start(args, call_private); multi[i].func(multi[i].probe_private, call_private, @@ -174,6 +178,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) } else { struct marker_probe_closure *multi; int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; /* * multi points to an array, therefore accessing the array * depends on reading multi. However, even in this case, @@ -182,7 +191,6 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...) * in the fast path, so put the explicit barrier here. */ smp_read_barrier_depends(); - multi = mdata->multi; for (i = 0; multi[i].func; i++) multi[i].func(multi[i].probe_private, call_private, mdata->format, &args); -- cgit v1.2.3-70-g09d2 From 641de9d8f505db055d451b50e6e38117f84e79bb Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 29 Jul 2008 22:33:38 -0700 Subject: printk: fix comment for printk ratelimiting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The comment assumed the burst to be one and the ratelimit used to be named printk_ratelimit_jiffies. Signed-off-by: Uwe Kleine-König Cc: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a7f7559c5f6..b51b1567bb5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1309,14 +1309,14 @@ void tty_write_message(struct tty_struct *tty, char *msg) #if defined CONFIG_PRINTK -DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); /* * printk rate limiting, lifted from the networking subsystem. * - * This enforces a rate limit: not more than one kernel message - * every printk_ratelimit_jiffies to make a denial-of-service - * attack impossible. + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + int printk_ratelimit(void) { return __ratelimit(&printk_ratelimit_state); -- cgit v1.2.3-70-g09d2 From 6af8bf3d86d55c98af6e453cb920ddc30867e5c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 29 Jul 2008 22:33:49 -0700 Subject: workqueues: add comments to __create_workqueue_key() Dmitry Adamushko pointed out that the error handling in __create_workqueue_key() is not clear, add the comment. Signed-off-by: Oleg Nesterov Cc: Dmitry Adamushko Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ec7e4f62aaf..4a26a1382df 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -830,10 +830,21 @@ struct workqueue_struct *__create_workqueue_key(const char *name, start_workqueue_thread(cwq, -1); } else { cpu_maps_update_begin(); + /* + * We must place this wq on list even if the code below fails. + * cpu_down(cpu) can remove cpu from cpu_populated_map before + * destroy_workqueue() takes the lock, in that case we leak + * cwq[cpu]->thread. + */ spin_lock(&workqueue_lock); list_add(&wq->list, &workqueues); spin_unlock(&workqueue_lock); - + /* + * We must initialize cwqs for each possible cpu even if we + * are going to call destroy_workqueue() finally. Otherwise + * cpu_up() can hit the uninitialized cwq once we drop the + * lock. + */ for_each_possible_cpu(cpu) { cwq = init_cpu_workqueue(wq, cpu); if (err || !cpu_online(cpu)) -- cgit v1.2.3-70-g09d2 From f718cd4add5aea9d379faff92f162571e356cc5f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 29 Jul 2008 22:33:52 -0700 Subject: sched: make scheduler sysfs attributes sysdev class devices They are really class devices, but were incorrectly declared. This leads to crashes with the recent changes that makes non normal sysdevs use a different prototype. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andi Kleen Cc: Ingo Molnar Cc: Pierre Ossman Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0236958addc..21f7da94662 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7671,34 +7671,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) } #ifdef CONFIG_SCHED_MC -static ssize_t sched_mc_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } -static ssize_t sched_mc_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); } -static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT -static ssize_t sched_smt_power_savings_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *page) +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } -static ssize_t sched_smt_power_savings_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); } -static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, sched_smt_power_savings_store); #endif -- cgit v1.2.3-70-g09d2 From a9b60bf4c29e07a5a2f26a6f74937972fee9b58b Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 1 Aug 2008 08:39:34 -0500 Subject: kgdb: fix kgdb_validate_break_address to perform a mem write A regression to the kgdb core was found in the case of using the CONFIG_DEBUG_RODATA kernel option. When this option is on, a breakpoint cannot be written into any readonly memory page. When an external debugger requests a breakpoint to get set, the kgdb_validate_break_address() was only checking to see if the address to place the breakpoint was readable and lacked a write check. This patch changes the validate routine to try reading (via the breakpoint set request) and also to try immediately writing the break point. If either fails, an error is correctly returned and the debugger behaves correctly. Then an end user can make the descision to use hardware breakpoints. Also update the documentation to reflect that using CONFIG_DEBUG_RODATA will inhibit the use of software breakpoints. Signed-off-by: Jason Wessel --- Documentation/DocBook/kgdb.tmpl | 10 ++++++++++ kernel/kgdb.c | 26 +++++++++++++++++++------- 2 files changed, 29 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index 54d3b158d08..372dec20c8d 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -106,6 +106,16 @@ while debugging the kernel. + If the architecture that you are using supports the kernel option + CONFIG_DEBUG_RODATA, you should consider turning it off. This + option will prevent the use of software breakpoints because it + marks certain regions of the kernel's memory space as read-only. + If kgdb supports it for the architecture you are using, you can + use hardware breakpoints if you desire to run with the + CONFIG_DEBUG_RODATA option turned on, else you need to turn off + this option. + + Next you should choose one of more I/O drivers to interconnect debugging host and debugged target. Early boot debugging requires a KGDB I/O driver that supports early debugging and the driver must be diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 3ec23c3ec97..c0d45b2c4d7 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -166,13 +166,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup); * Weak aliases for breakpoint management, * can be overriden by architectures when needed: */ -int __weak kgdb_validate_break_address(unsigned long addr) -{ - char tmp_variable[BREAK_INSTR_SIZE]; - - return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE); -} - int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) { int err; @@ -191,6 +184,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) (char *)bundle, BREAK_INSTR_SIZE); } +int __weak kgdb_validate_break_address(unsigned long addr) +{ + char tmp_variable[BREAK_INSTR_SIZE]; + int err; + /* Validate setting the breakpoint and then removing it. In the + * remove fails, the kernel needs to emit a bad message because we + * are deep trouble not being able to put things back the way we + * found them. + */ + err = kgdb_arch_set_breakpoint(addr, tmp_variable); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(addr, tmp_variable); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) { return instruction_pointer(regs); -- cgit v1.2.3-70-g09d2 From 25fc999913839a45cbb48ac7872e67f7521e7ed9 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 1 Aug 2008 08:39:35 -0500 Subject: kgdb: fix gdb serial thread queries The command "info threads" did not work correctly with kgdb. It would result in a silent kernel hang if used. This patach addresses several problems. - Fix use of deprecated NR_CPUS - Fix kgdb to not walk linearly through the pid space - Correctly implement shadow pids - Change the threads per query to a #define - Fix kgdb_hex2long to work with negated values The threads 0 and -1 are reserved to represent the current task. That means that CPU 0 will start with a shadow thread id of -2, and CPU 1 will have a shadow thread id of -3, etc... From the debugger you can switch to a shadow thread to see what one of the other cpus was doing, however it is not possible to execute run control operations on any other cpu execept the cpu executing the kgdb_handle_exception(). Signed-off-by: Jason Wessel --- kernel/kgdb.c | 68 +++++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 50 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/kgdb.c b/kernel/kgdb.c index c0d45b2c4d7..eaa21fc9ad1 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -56,12 +56,14 @@ static int kgdb_break_asap; +#define KGDB_MAX_THREAD_QUERY 17 struct kgdb_state { int ex_vector; int signo; int err_code; int cpu; int pass_exception; + unsigned long thr_query; unsigned long threadid; long kgdb_usethreadid; struct pt_regs *linux_regs; @@ -445,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) { int hex_val; int num = 0; + int negate = 0; *long_val = 0; + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } while (**ptr) { hex_val = hex(**ptr); if (hex_val < 0) @@ -458,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val) (*ptr)++; } + if (negate) + *long_val = -*long_val; + return num; } @@ -527,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value) static struct task_struct *getthread(struct pt_regs *regs, int tid) { /* - * Non-positive TIDs are remapped idle tasks: + * Non-positive TIDs are remapped to the cpu shadow information */ - if (tid <= 0) - return idle_task(-tid); + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < 0) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } /* * find_task_by_pid_ns() does not take the tasklist lock anymore @@ -737,14 +753,15 @@ setundefined: } /* - * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 */ static inline int shadow_pid(int realpid) { if (realpid) return realpid; - return -1-raw_smp_processor_id(); + return -raw_smp_processor_id() - 2; } static char gdbmsgbuf[BUFMAX + 1]; @@ -838,7 +855,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks) local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; } else { local_debuggerinfo = NULL; - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { /* * Try to find the task on some other * or possibly this node if we do not @@ -972,10 +989,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks) /* Handle the 'q' query packets */ static void gdb_cmd_query(struct kgdb_state *ks) { - struct task_struct *thread; + struct task_struct *g; + struct task_struct *p; unsigned char thref[8]; char *ptr; int i; + int cpu; + int finished = 0; switch (remcom_in_buffer[1]) { case 's': @@ -985,22 +1005,34 @@ static void gdb_cmd_query(struct kgdb_state *ks) break; } - if (remcom_in_buffer[1] == 'f') - ks->threadid = 1; - + i = 0; remcom_out_buffer[0] = 'm'; ptr = remcom_out_buffer + 1; - - for (i = 0; i < 17; ks->threadid++) { - thread = getthread(ks->linux_regs, ks->threadid); - if (thread) { - int_to_threadref(thref, ks->threadid); + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); pack_threadid(ptr, thref); ptr += BUF_THREAD_ID_SIZE; *(ptr++) = ','; i++; } } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + pack_threadid(ptr, thref); + ptr += BUF_THREAD_ID_SIZE; + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + *(--ptr) = '\0'; break; @@ -1023,15 +1055,15 @@ static void gdb_cmd_query(struct kgdb_state *ks) error_packet(remcom_out_buffer, -EINVAL); break; } - if (ks->threadid > 0) { + if ((int)ks->threadid > 0) { kgdb_mem2hex(getthread(ks->linux_regs, ks->threadid)->comm, remcom_out_buffer, 16); } else { static char tmpstr[23 + BUF_THREAD_ID_SIZE]; - sprintf(tmpstr, "Shadow task %d for pid 0", - (int)(-ks->threadid-1)); + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); } break; -- cgit v1.2.3-70-g09d2 From ee1d315663ee0b494898f813a266d6244b263b4f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 7 Jul 2008 10:49:45 -0400 Subject: [PATCH] Audit: Collect signal info when SIGUSR2 is sent to auditd Makes the kernel audit subsystem collect information about the sending process when that process sends SIGUSR2 to the userspace audit daemon. SIGUSR2 is a new interesting signal to auditd telling auditd that it should try to start logging to disk again and the error condition which caused it to stop logging to disk (usually out of space) has been rectified. Signed-off-by: Eric Paris Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 4699950e65b..580a5389fd9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2375,7 +2375,7 @@ int __audit_signal_info(int sig, struct task_struct *t) struct audit_context *ctx = tsk->audit_context; if (audit_pid && t->tgid == audit_pid) { - if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { audit_sig_pid = tsk->pid; if (tsk->loginuid != -1) audit_sig_uid = tsk->loginuid; -- cgit v1.2.3-70-g09d2 From 1d6c9649e236caa2e93e3647256216e57172b011 Mon Sep 17 00:00:00 2001 From: Vesa-Matti J Kari Date: Wed, 23 Jul 2008 00:06:13 +0300 Subject: kernel/audit.c control character detection is off-by-one Hello, According to my understanding there is an off-by-one bug in the function: audit_string_contains_control() in: kernel/audit.c Patch is included. I do not know from how many places the function is called from, but for example, SELinux Access Vector Cache tries to log untrusted filenames via call path: avc_audit() audit_log_untrustedstring() audit_log_n_untrustedstring() audit_string_contains_control() If audit_string_contains_control() detects control characters, then the string is hex-encoded. But the hex=0x7f dec=127, DEL-character, is not detected. I guess this could have at least some minor security implications, since a user can create a filename with 0x7f in it, causing logged filename to possibly look different when someone reads it on the terminal. Signed-off-by: Vesa-Matti Kari Signed-off-by: Al Viro --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e092f1c0ce3..6d903182c6b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1366,7 +1366,7 @@ int audit_string_contains_control(const char *string, size_t len) { const unsigned char *p; for (p = string; p < (const unsigned char *)string + len && *p; p++) { - if (*p == '"' || *p < 0x21 || *p > 0x7f) + if (*p == '"' || *p < 0x21 || *p > 0x7e) return 1; } return 0; -- cgit v1.2.3-70-g09d2 From 036bbf76ad9f83781590623111b80ba0b82930ac Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Fri, 1 Aug 2008 09:47:01 +0800 Subject: Re: [PATCH] the loginuid field should be output in all AUDIT_CONFIG_CHANGE audit messages > shouldn't these be using the "audit_get_loginuid(current)" and if we > are going to output loginuid we also should be outputting sessionid Thanks for your detailed explanation. I have made a new patch for outputing "loginuid" and "sessionid" by audit_get_loginuid(current) and audit_get_sessionid(current). If there are some deficiencies, please give me your indication. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditfilter.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 98c50cc671b..b7d354e2b0e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent, struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); audit_log_format(ab, - "op=updated rules specifying path="); + " op=updated rules specifying path="); audit_log_untrustedstring(ab, owatch->path); audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); @@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent) struct audit_buffer *ab; ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); - audit_log_format(ab, "op=remove rule path="); + audit_log_format(ab, "auid=%u ses=%u", + audit_get_loginuid(current), + audit_get_sessionid(current)); + audit_log_format(ab, " op=remove rule path="); audit_log_untrustedstring(ab, w->path); if (r->filterkey) { audit_log_format(ab, " key="); -- cgit v1.2.3-70-g09d2 From 980dfb0db340b95094732d78b55311f2c539c1af Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Fri, 1 Aug 2008 19:15:47 +0800 Subject: [PATCH] Fix the kernel panic of audit_filter_task when key field is set When calling audit_filter_task(), it calls audit_filter_rules() with audit_context is NULL. If the key field is set, the result in audit_filter_rules() will be set to 1 and ctx->filterkey will be set to key. But the ctx is NULL in this condition, so kernel will panic. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 580a5389fd9..496c3dd3727 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -610,7 +610,7 @@ static int audit_filter_rules(struct task_struct *tsk, if (!result) return 0; } - if (rule->filterkey) + if (rule->filterkey && ctx) ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; -- cgit v1.2.3-70-g09d2 From 20c6aaa39ab735c7ed78e4e5a214d250efae0a6e Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Thu, 31 Jul 2008 10:11:19 +0800 Subject: [PATCH] Fix the bug of using AUDIT_STATUS_RATE_LIMIT when set fail, no error output. When the "status_get->mask" is "AUDIT_STATUS_RATE_LIMIT || AUDIT_STATUS_BACKLOG_LIMIT". If "audit_set_rate_limit" fails and "audit_set_backlog_limit" succeeds, the "err" value will be greater than or equal to 0. It will miss the failure of rate set. Signed-off-by: Zhang Xiliang Acked-by: Eric Paris Signed-off-by: Al Viro --- kernel/audit.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6d903182c6b..4414e93d875 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_ENABLED) { err = audit_set_enabled(status_get->enabled, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_FAILURE) { err = audit_set_failure(status_get->failure, loginuid, sessionid, sid); - if (err < 0) return err; + if (err < 0) + return err; } if (status_get->mask & AUDIT_STATUS_PID) { int new_pid = status_get->pid; @@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = new_pid; audit_nlk_pid = NETLINK_CB(skb).pid; } - if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) + if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(status_get->rate_limit, loginuid, sessionid, sid); + if (err < 0) + return err; + } if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sessionid, sid); -- cgit v1.2.3-70-g09d2 From 5c7edcd7ee6b77b88252fe4096dce1a46a60c829 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Thu, 31 Jul 2008 02:04:09 -0700 Subject: tracehook: fix exit_signal=0 case My commit 2b2a1ff64afbadac842bbc58c5166962cf4f7664 introduced a regression (sorry about that) for the odd case of exit_signal=0 (e.g. clone_flags=0). This is not a normal use, but it's used by a case in the glibc test suite. Dying with exit_signal=0 sends no signal, but it's supposed to wake up a parent's blocked wait*() calls (unlike the delayed_group_leader case). This fixes tracehook_notify_death() and its caller to distinguish a "signal 0" wakeup from the delayed_group_leader case (with no wakeup). Signed-off-by: Roland McGrath Tested-by: Serge Hallyn Signed-off-by: Linus Torvalds --- include/linux/tracehook.h | 21 +++++++++++++-------- kernel/exit.c | 6 +++--- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index b1875582c1a..12532839f50 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -493,16 +493,21 @@ static inline int tracehook_notify_jctl(int notify, int why) * @death_cookie: value to pass to tracehook_report_death() * @group_dead: nonzero if this was the last thread in the group to die * - * Return the signal number to send our parent with do_notify_parent(), or - * zero to send no signal and leave a zombie, or -1 to self-reap right now. + * A return value >= 0 means call do_notify_parent() with that signal + * number. Negative return value can be %DEATH_REAP to self-reap right + * now, or %DEATH_DELAYED_GROUP_LEADER to a zombie without notifying our + * parent. Note that a return value of 0 means a do_notify_parent() call + * that sends no signal, but still wakes up a parent blocked in wait*(). * * Called with write_lock_irq(&tasklist_lock) held. */ +#define DEATH_REAP -1 +#define DEATH_DELAYED_GROUP_LEADER -2 static inline int tracehook_notify_death(struct task_struct *task, void **death_cookie, int group_dead) { if (task->exit_signal == -1) - return task->ptrace ? SIGCHLD : -1; + return task->ptrace ? SIGCHLD : DEATH_REAP; /* * If something other than our normal parent is ptracing us, then @@ -512,21 +517,21 @@ static inline int tracehook_notify_death(struct task_struct *task, if (thread_group_empty(task) && !ptrace_reparented(task)) return task->exit_signal; - return task->ptrace ? SIGCHLD : 0; + return task->ptrace ? SIGCHLD : DEATH_DELAYED_GROUP_LEADER; } /** * tracehook_report_death - task is dead and ready to be reaped * @task: @current task now exiting - * @signal: signal number sent to parent, or 0 or -1 + * @signal: return value from tracheook_notify_death() * @death_cookie: value passed back from tracehook_notify_death() * @group_dead: nonzero if this was the last thread in the group to die * * Thread has just become a zombie or is about to self-reap. If positive, * @signal is the signal number just sent to the parent (usually %SIGCHLD). - * If @signal is -1, this thread will self-reap. If @signal is 0, this is - * a delayed_group_leader() zombie. The @death_cookie was passed back by - * tracehook_notify_death(). + * If @signal is %DEATH_REAP, this thread will self-reap. If @signal is + * %DEATH_DELAYED_GROUP_LEADER, this is a delayed_group_leader() zombie. + * The @death_cookie was passed back by tracehook_notify_death(). * * If normal reaping is not inhibited, @task->exit_state might be changing * in parallel. diff --git a/kernel/exit.c b/kernel/exit.c index eb4d6470d1d..38ec4063014 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -911,10 +911,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); - if (signal > 0) + if (signal >= 0) signal = do_notify_parent(tsk, signal); - tsk->exit_state = signal < 0 ? EXIT_DEAD : EXIT_ZOMBIE; + tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; /* mt-exec, de_thread() is waiting for us */ if (thread_group_leader(tsk) && @@ -927,7 +927,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tracehook_report_death(tsk, signal, cookie, group_dead); /* If the process is dead, release it - nobody will wait for it */ - if (signal < 0) + if (signal == DEATH_REAP) release_task(tsk); } -- cgit v1.2.3-70-g09d2 From 1a61c88defcd611bd148d6c960b498e1b8bbbe00 Mon Sep 17 00:00:00 2001 From: zhangxiliang Date: Sat, 2 Aug 2008 10:56:37 +0800 Subject: Re: [PATCH] Fix the kernel panic of audit_filter_task when key field is set Sorry, I miss a blank between if and "(". And I add "unlikely" to check "ctx" in audit_match_perm() and audit_match_filetype(). This is a new patch for it. Signed-off-by: Zhang Xiliang Signed-off-by: Al Viro --- kernel/auditsc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 496c3dd3727..972f8e61d36 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -243,6 +243,9 @@ static inline int open_arg(int flags, int mask) static int audit_match_perm(struct audit_context *ctx, int mask) { + if (unlikely(!ctx)) + return 0; + unsigned n = ctx->major; switch (audit_classify_syscall(ctx->arch, n)) { case 0: /* native */ @@ -284,6 +287,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which) { unsigned index = which & ~S_IFMT; mode_t mode = which & S_IFMT; + + if (unlikely(!ctx)) + return 0; + if (index >= ctx->name_count) return 0; if (ctx->names[index].ino == -1) -- cgit v1.2.3-70-g09d2 From 725aad24c3ba96a7c06448c14c265a466cdbd663 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 3 Aug 2008 09:33:03 -0700 Subject: __sched_setscheduler: don't do any policy checks when not "user" The "user" parameter to __sched_setscheduler indicates whether the change is being done on behalf of a user process or not. If not, we shouldn't apply any permissions checks, so don't call security_task_setscheduler(). Signed-off-by: Jeremy Fitzhardinge Tested-by: Steve Wise Cc: Rusty Russell Cc: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/sched.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 21f7da94662..04160d277e7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5004,19 +5004,21 @@ recheck: return -EPERM; } + if (user) { #ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (user - && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + return -EPERM; #endif - retval = security_task_setscheduler(p, policy, param); - if (retval) - return retval; + retval = security_task_setscheduler(p, policy, param); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: -- cgit v1.2.3-70-g09d2 From 32194450330be327f3b25bf6b66298bd122599e9 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 5 Aug 2008 13:01:10 -0700 Subject: relay: fix "full buffer with exactly full last subbuffer" accounting problem In relay's current read implementation, if the buffer is completely full but hasn't triggered the buffer-full condition (i.e. the last write didn't cross the subbuffer boundary) and the last subbuffer is exactly full, the subbuffer accounting code erroneously finds nothing available. This patch fixes the problem. Signed-off-by: Tom Zanussi Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Cc: Jens Axboe Cc: Mathieu Desnoyers Cc: Andrea Righi Cc: [2.6.25.x, 2.6.26.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 04006ef970b..8d13a7855c0 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -944,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf, size_t n_subbufs = buf->chan->n_subbufs; size_t read_subbuf; + if (buf->subbufs_produced == buf->subbufs_consumed && + buf->offset == buf->bytes_consumed) + return; + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { relay_subbufs_consumed(buf->chan, buf->cpu, 1); buf->bytes_consumed = 0; @@ -975,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) relay_file_read_consume(buf, read_pos, 0); + consumed = buf->subbufs_consumed; + if (unlikely(buf->offset > subbuf_size)) { if (produced == consumed) return 0; @@ -993,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) if (consumed > produced) produced += n_subbufs * subbuf_size; - if (consumed == produced) + if (consumed == produced) { + if (buf->offset == subbuf_size && + buf->subbufs_produced > buf->subbufs_consumed) + return 1; return 0; + } return 1; } -- cgit v1.2.3-70-g09d2 From 5b2becc8cffdccdd60c63099f592ddd35aa6c34f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 5 Aug 2008 13:01:13 -0700 Subject: semaphore: __down_common: use signal_pending_state() Change __down_common() to use signal_pending_state() instead of open coding. The changes in kernel/semaphore.o are just artifacts, the state checks are optimized away. Signed-off-by: Oleg Nesterov Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/semaphore.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/semaphore.c b/kernel/semaphore.c index aaaeae8244e..94a62c0d4ad 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state, waiter.up = 0; for (;;) { - if (state == TASK_INTERRUPTIBLE && signal_pending(task)) - goto interrupted; - if (state == TASK_KILLABLE && fatal_signal_pending(task)) + if (signal_pending_state(state, task)) goto interrupted; if (timeout <= 0) goto timed_out; -- cgit v1.2.3-70-g09d2 From c69ad71bcdecbaab82cfacb1dc967bd7fd967a3b Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 5 Aug 2008 13:01:14 -0700 Subject: genirq: better warning on irqchip->set_type() failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While I'm glad to finally see the hole fixed whereby passing an invalid IRQ trigger type to request_irq() would be ignored, the current diagnostic isn't quite useful. Fixed by also listing the trigger type which was rejected. Signed-off-by: David Brownell Acked-by: Uwe Kleine-König Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 152abfd3589..0314074fa23 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -323,7 +323,8 @@ static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq, ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK); if (ret) - pr_err("setting flow type for irq %u failed (%pF)\n", + pr_err("setting trigger mode %d for irq %u failed (%pF)\n", + (int)(flags & IRQF_TRIGGER_MASK), irq, chip->set_type); return ret; -- cgit v1.2.3-70-g09d2 From d2dc1f4adb4b5b02d87e49e115e5107f4da790c0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 5 Aug 2008 13:01:31 -0700 Subject: dma: fix order calculation in dma_mark_declared_memory_occupied() get_order() takes byte-sized input, not a page-granular one. Irrespective of this fix I'm inclined to believe that this doesn't work right anyway - bitmap_allocate_region() has an implicit assumption of 'pos' being suitable for 'order', which this function doesn't seem to enforce (and since it's being called with a byte-granular value there's no reason to believe that the callers would make sure device_addr is passed accordingly - it's also not documented that way). Signed-off-by: Jan Beulich Cc: James E.J. Bottomley Cc: Ingo Molnar Cc: Dmitry Baryshkov Cc: Jesse Barnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/dma-coherent.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c index 7517115a8cc..91e96950cd5 100644 --- a/kernel/dma-coherent.c +++ b/kernel/dma-coherent.c @@ -77,15 +77,14 @@ void *dma_mark_declared_memory_occupied(struct device *dev, { struct dma_coherent_mem *mem = dev->dma_mem; int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - pages >>= PAGE_SHIFT; + size += device_addr & ~PAGE_MASK; if (!mem) return ERR_PTR(-EINVAL); pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); if (err != 0) return ERR_PTR(err); return mem->virt_base + (pos << PAGE_SHIFT); -- cgit v1.2.3-70-g09d2 From bf1db69fbf4ff511e88736ce2e6318846f34492b Mon Sep 17 00:00:00 2001 From: Richard Hughes Date: Tue, 5 Aug 2008 13:01:35 -0700 Subject: pm_qos: spelling fixes A documentation cleanup patch. With a minor tweak to clarify units for kbs. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: mark gross Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/power/pm_qos_interface.txt | 7 ++++++- include/linux/pm_qos_params.h | 2 +- kernel/pm_qos_params.c | 16 ++++++++-------- 3 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index 49adb1a3351..c40866e8b95 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -1,4 +1,4 @@ -PM quality of Service interface. +PM Quality Of Service Interface. This interface provides a kernel and user mode interface for registering performance expectations by drivers, subsystems and user space applications on @@ -7,6 +7,11 @@ one of the parameters. Currently we have {cpu_dma_latency, network_latency, network_throughput} as the initial set of pm_qos parameters. +Each parameters have defined units: + * latency: usec + * timeout: usec + * throughput: kbs (kilo bit / sec) + The infrastructure exposes multiple misc device nodes one per implemented parameter. The set of parameters implement is defined by pm_qos_power_init() and pm_qos_params.h. This is done because having the available parameters diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h index 2e4e97bd19f..d74f75ed1e4 100644 --- a/include/linux/pm_qos_params.h +++ b/include/linux/pm_qos_params.h @@ -1,6 +1,6 @@ /* interface for the pm_qos_power infrastructure of the linux kernel. * - * Mark Gross + * Mark Gross */ #include #include diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 8cb75702638..da9c2dda6a4 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -24,7 +24,7 @@ * requirement that the application has is cleaned up when closes the file * pointer or exits the pm_qos_object will get an opportunity to clean up. * - * mark gross mgross@linux.intel.com + * Mark Gross */ #include @@ -211,8 +211,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement); * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos - * performance charactoistics. It recomputes the agregate QoS expectations for - * the pm_qos_class of parrameters. + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters. */ int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) { @@ -250,10 +250,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement); * @name: identifies the request * @value: defines the qos request * - * Updates an existing qos requierement for the pm_qos_class of parameters along + * Updates an existing qos requirement for the pm_qos_class of parameters along * with updating the target pm_qos_class value. * - * If the named request isn't in the lest then no change is made. + * If the named request isn't in the list then no change is made. */ int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) { @@ -287,7 +287,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement); * @pm_qos_class: identifies which list of qos request to us * @name: identifies the request * - * Will remove named qos request from pm_qos_class list of parrameters and + * Will remove named qos request from pm_qos_class list of parameters and * recompute the current target value for the pm_qos_class. */ void pm_qos_remove_requirement(int pm_qos_class, char *name) @@ -319,7 +319,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); * @notifier: notifier block managed by caller. * * will register the notifier into a notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) { @@ -338,7 +338,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier); * @notifier: notifier block to be removed. * * will remove the notifier from the notification chain that gets called - * uppon changes to the pm_qos_class target value. + * upon changes to the pm_qos_class target value. */ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) { -- cgit v1.2.3-70-g09d2