From f6d87f4bd259cf33e092cd1a8fde05f291c47af1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 13:18:30 +0100 Subject: genirq: keep affinities set from userspace across free/request_irq() Impact: preserve user-modified affinities on interrupts Kumar Galak noticed that commit 18404756765c713a0be4eb1082920c04822ce588 (genirq: Expose default irq affinity mask (take 3)) overrides an already set affinity setting across a free / request_irq(). Happens e.g. with ifdown/ifup of a network device. Change the logic to mark the affinities as set and keep them intact. This also fixes the unlocked access to irq_desc in irq_select_affinity() when called from irq_affinity_proc_write() Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/irq/internals.h | 2 ++ kernel/irq/manage.c | 58 +++++++++++++++++++++++++++++++++++++++++--------- kernel/irq/migration.c | 11 ---------- kernel/irq/proc.c | 2 +- 4 files changed, 51 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index c9767e64198..64c1c7253da 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -25,6 +25,8 @@ static inline void unregister_handler_proc(unsigned int irq, struct irqaction *action) { } #endif +extern int irq_select_affinity_usr(unsigned int irq); + /* * Debugging printout: */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c498a1b8c62..634a2a95510 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -82,24 +82,27 @@ int irq_can_set_affinity(unsigned int irq) int irq_set_affinity(unsigned int irq, cpumask_t cpumask) { struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; if (!desc->chip->set_affinity) return -EINVAL; + spin_lock_irqsave(&desc->lock, flags); + #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); - spin_unlock_irqrestore(&desc->lock, flags); - } else - set_pending_irq(irq, cpumask); + } else { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = cpumask; + } #else desc->affinity = cpumask; desc->chip->set_affinity(irq, cpumask); #endif + desc->status |= IRQ_AFFINITY_SET; + spin_unlock_irqrestore(&desc->lock, flags); return 0; } @@ -107,24 +110,59 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask) /* * Generic version of the affinity autoselector. */ -int irq_select_affinity(unsigned int irq) +int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) { cpumask_t mask; - struct irq_desc *desc; if (!irq_can_set_affinity(irq)) return 0; cpus_and(mask, cpu_online_map, irq_default_affinity); - desc = irq_to_desc(irq); + /* + * Preserve an userspace affinity setup, but make sure that + * one of the targets is online. 
+ */ + if (desc->status & IRQ_AFFINITY_SET) { + if (cpus_intersects(desc->affinity, cpu_online_map)) + mask = desc->affinity; + else + desc->status &= ~IRQ_AFFINITY_SET; + } + desc->affinity = mask; desc->chip->set_affinity(irq, mask); return 0; } +#else +static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d) +{ + return irq_select_affinity(irq); +} #endif +/* + * Called when affinity is set via /proc/irq + */ +int irq_select_affinity_usr(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + spin_lock_irqsave(&desc->lock, flags); + ret = do_irq_select_affinity(irq, desc); + spin_unlock_irqrestore(&desc->lock, flags); + + return ret; +} + +#else +static inline int do_select_irq_affinity(int irq, struct irq_desc *desc) +{ + return 0; +} #endif /** @@ -446,7 +484,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) desc->depth = 1; /* Set default affinity mask once everything is setup */ - irq_select_affinity(irq); + do_irq_select_affinity(irq, desc); } else if ((new->flags & IRQF_TRIGGER_MASK) && (new->flags & IRQF_TRIGGER_MASK) diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 90b920d3f52..9db681d9581 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -1,17 +1,6 @@ #include -void set_pending_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - spin_unlock_irqrestore(&desc->lock, flags); -} - void move_masked_irq(int irq) { struct irq_desc *desc = irq_to_desc(irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 4d161c70ba5..d257e7d6a8a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -62,7 +62,7 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!cpus_intersects(new_value, cpu_online_map)) /* Special case for empty set - allow the architecture code to set default SMP affinity. */ - return irq_select_affinity(irq) ? -EINVAL : count; + return irq_select_affinity_usr(irq) ? -EINVAL : count; irq_set_affinity(irq, new_value); -- cgit v1.2.3-70-g09d2 From 612e3684c1b7752d2890510e4f90115fd1eb2afb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 7 Nov 2008 13:58:46 +0100 Subject: genirq: fix the affinity setting in setup_irq The affinity setting in setup irq is called before the NO_BALANCING flag is checked and might therefore override affinity settings from the calling code with the default setting. Move the NO_BALANCING flag check before the call to the affinity setting. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 634a2a95510..948a22a2c01 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -123,7 +123,7 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) * Preserve an userspace affinity setup, but make sure that * one of the targets is online. 
*/ - if (desc->status & IRQ_AFFINITY_SET) { + if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { if (cpus_intersects(desc->affinity, cpu_online_map)) mask = desc->affinity; else @@ -483,6 +483,10 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) /* Undo nested disables: */ desc->depth = 1; + /* Exclude IRQ from balancing if requested */ + if (new->flags & IRQF_NOBALANCING) + desc->status |= IRQ_NO_BALANCING; + /* Set default affinity mask once everything is setup */ do_irq_select_affinity(irq, desc); @@ -497,10 +501,6 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new) *p = new; - /* Exclude IRQ from balancing */ - if (new->flags & IRQF_NOBALANCING) - desc->status |= IRQ_NO_BALANCING; - /* Reset broken irq detection when installing new handler */ desc->irq_count = 0; desc->irqs_unhandled = 0; -- cgit v1.2.3-70-g09d2 From f131e2436ddbac2527bb2d6297a823aae4b024f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 8 Nov 2008 09:57:40 +0100 Subject: irq: fix typo Impact: build fix fix build failure on UP. Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 948a22a2c01..435861284e4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -159,7 +159,7 @@ int irq_select_affinity_usr(unsigned int irq) } #else -static inline int do_select_irq_affinity(int irq, struct irq_desc *desc) +static inline int do_irq_select_affinity(int irq, struct irq_desc *desc) { return 0; } -- cgit v1.2.3-70-g09d2 From a358324466b171e145df20bdb74fe81759906de6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 11 Nov 2008 15:01:42 -0500 Subject: ring-buffer: buffer record on/off switch Impact: enable/disable ring buffer recording API added Several kernel developers have requested that there be a way to stop recording into the ring buffers with a simple switch that can also be enabled from userspace. This patch addes a new kernel API to the ring buffers called: tracing_on() tracing_off() When tracing_off() is called, all ring buffers will not be able to record into their buffers. tracing_on() will enable the ring buffers again. These two act like an on/off switch. That is, there is no counting of the number of times tracing_off or tracing_on has been called. A new file is added to the debugfs/tracing directory called tracing_on This allows for userspace applications to also flip the switch. echo 0 > debugfs/tracing/tracing_on disables the tracing. echo 1 > /debugfs/tracing/tracing_on enables it. Note, this does not disable or enable any tracers. It only sets or clears a flag that needs to be set in order for the ring buffers to write to their buffers. It is a global flag, and affects all ring buffers. The buffers start out with tracing_on enabled. There are now three flags that control recording into the buffers: tracing_on: which affects all ring buffer tracers. buffer->record_disabled: which affects an allocated buffer, which may be set if an anomaly is detected, and tracing is disabled. cpu_buffer->record_disabled: which is set by tracing_stop() or if an anomaly is detected. tracing_start can not reenable this if an anomaly occurred. The userspace debugfs/tracing/tracing_enabled is implemented with tracing_stop() but the user space code can not enable it if the kernel called tracing_stop(). Userspace can enable the tracing_on even if the kernel disabled it. 
It is just a switch used to stop tracing if a condition was hit. tracing_on is not for protecting critical areas in the kernel nor is it for stopping tracing if an anomaly occurred. This is because userspace can reenable it at any time. Side effect: With this patch, I discovered a dead variable in ftrace.c called tracing_on. This patch removes it. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 3 ++ kernel/trace/ftrace.c | 8 +--- kernel/trace/ring_buffer.c | 101 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 536b0ca46a0..e097c2e6b6d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -120,6 +120,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); +void tracing_on(void); +void tracing_off(void); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24568c..14fa52297b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -185,7 +185,6 @@ enum { }; static int ftrace_filtered; -static int tracing_on; static LIST_HEAD(ftrace_new_addrs); @@ -506,13 +505,10 @@ static int __ftrace_modify_code(void *data) { int *command = data; - if (*command & FTRACE_ENABLE_CALLS) { + if (*command & FTRACE_ENABLE_CALLS) ftrace_replace_code(1); - tracing_on = 1; - } else if (*command & FTRACE_DISABLE_CALLS) { + else if (*command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - tracing_on = 0; - } if (*command & FTRACE_UPDATE_TRACE_FUNC) ftrace_update_ftrace_func(ftrace_trace_function); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2f76193c348..b08ee9f00c8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -16,6 +16,35 @@ #include #include +#include "trace.h" + +/* Global flag to disable all recording to ring buffers */ +static int ring_buffers_off __read_mostly; + +/** + * tracing_on - enable all tracing buffers + * + * This function enables all tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + ring_buffers_off = 0; +} + +/** + * tracing_off - turn off all tracing buffers + * + * This function stops all tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. 
+ */ +void tracing_off(void) +{ + ring_buffers_off = 1; +} + /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 @@ -1133,6 +1162,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, struct ring_buffer_event *event; int cpu, resched; + if (ring_buffers_off) + return NULL; + if (atomic_read(&buffer->record_disabled)) return NULL; @@ -1249,6 +1281,9 @@ int ring_buffer_write(struct ring_buffer *buffer, int ret = -EBUSY; int cpu, resched; + if (ring_buffers_off) + return -EBUSY; + if (atomic_read(&buffer->record_disabled)) return -EBUSY; @@ -2070,3 +2105,69 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a, return 0; } +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int *p = filp->private_data; + char buf[64]; + int r; + + /* !ring_buffers_off == tracing_on */ + r = sprintf(buf, "%d\n", !*p); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + int *p = filp->private_data; + char buf[64]; + long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + /* !ring_buffers_off == tracing_on */ + *p = !val; + + (*ppos)++; + + return cnt; +} + +static struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, +}; + + +static __init int rb_init_debugfs(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + + entry = debugfs_create_file("tracing_on", 0644, d_tracer, + &ring_buffers_off, &rb_simple_fops); + if (!entry) + pr_warning("Could not create debugfs 'tracing_on' entry\n"); + + return 0; +} + +fs_initcall(rb_init_debugfs); -- cgit v1.2.3-70-g09d2 From 621a0d5207c18012cb39932f2d9830a11a6cb03d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 12 Nov 2008 09:36:35 +0100 Subject: hrtimer: clean up unused callback modes Impact: cleanup git grep HRTIMER_CB_IRQSAFE revealed half the callback modes are actually unused. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 5 ----- kernel/hrtimer.c | 9 --------- 2 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 07e510a3b00..3eba43878dc 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -46,9 +46,6 @@ enum hrtimer_restart { * hrtimer callback modes: * * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context - * HRTIMER_CB_IRQSAFE: Callback may run in hardirq context - * HRTIMER_CB_IRQSAFE_NO_RESTART: Callback may run in hardirq context and - * does not restart the timer * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context * Special mode for tick emulation and * scheduler timer. 
Such timers are per @@ -61,8 +58,6 @@ enum hrtimer_restart { */ enum hrtimer_cb_mode { HRTIMER_CB_SOFTIRQ, - HRTIMER_CB_IRQSAFE, - HRTIMER_CB_IRQSAFE_NO_RESTART, HRTIMER_CB_IRQSAFE_PERCPU, HRTIMER_CB_IRQSAFE_UNLOCKED, }; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 95d3949f2ae..47e63349d1b 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -664,14 +664,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, /* Timer is expired, act upon the callback mode */ switch(timer->cb_mode) { - case HRTIMER_CB_IRQSAFE_NO_RESTART: - debug_hrtimer_deactivate(timer); - /* - * We can call the callback from here. No restart - * happens, so no danger of recursion - */ - BUG_ON(timer->function(timer) != HRTIMER_NORESTART); - return 1; case HRTIMER_CB_IRQSAFE_PERCPU: case HRTIMER_CB_IRQSAFE_UNLOCKED: /* @@ -683,7 +675,6 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, */ debug_hrtimer_deactivate(timer); return 1; - case HRTIMER_CB_IRQSAFE: case HRTIMER_CB_SOFTIRQ: /* * Move everything else into the softirq pending list ! -- cgit v1.2.3-70-g09d2 From 47e74f2ba8fbf9fb1378e2524e6cfdc2fb37f160 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 12 Nov 2008 00:01:27 -0500 Subject: ring-buffer: no preempt for sched_clock() Impact: disable preemption when calling sched_clock() The ring_buffer_time_stamp still uses sched_clock as its counter. But it is a bug to call it with preemption enabled. This requirement should not be pushed to the ring_buffer_time_stamp callers, so the ring_buffer_time_stamp needs to disable preemption when calling sched_clock. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b08ee9f00c8..231db209fa8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -51,8 +51,14 @@ void tracing_off(void) /* FIXME!!! */ u64 ring_buffer_time_stamp(int cpu) { + u64 time; + + preempt_disable_notrace(); /* shift to debug/test normalization and TIME_EXTENTS */ - return sched_clock() << DEBUG_SHIFT; + time = sched_clock() << DEBUG_SHIFT; + preempt_enable_notrace(); + + return time; } void ring_buffer_normalize_time_stamp(int cpu, u64 *ts) -- cgit v1.2.3-70-g09d2 From a2d477778e82a60a0b7114cefdb70aa43af28782 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 12 Nov 2008 16:19:00 +0530 Subject: sched: fix stale value in average load per task Impact: fix load balancer load average calculation accuracy cpu_avg_load_per_task() returns a stale value when nr_running is 0. It returns an older stale (caculated when nr_running was non zero) value. This patch returns and sets rq->avg_load_per_task to zero when nr_running is 0. Compile and boot tested on a x86_64 box. 
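For reference, the helper ends up looking roughly like this after the change (a condensed sketch assembled from the hunk below, not a literal copy of the tree):

	static unsigned long cpu_avg_load_per_task(int cpu)
	{
		struct rq *rq = cpu_rq(cpu);

		if (rq->nr_running)
			rq->avg_load_per_task = rq->load.weight / rq->nr_running;
		else
			/* no runnable tasks: report zero rather than a stale average */
			rq->avg_load_per_task = 0;

		return rq->avg_load_per_task;
	}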
Signed-off-by: Balbir Singh Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 50a21f96467..3bafbe350f4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1456,6 +1456,8 @@ static unsigned long cpu_avg_load_per_task(int cpu) if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; + else + rq->avg_load_per_task = 0; return rq->avg_load_per_task; } -- cgit v1.2.3-70-g09d2 From 5cbd54ef470d880fc37fbe4b21eb514806d51e0d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 12 Nov 2008 20:05:50 +0100 Subject: sched: fix init_idle()'s use of sched_clock() Maciej Rutecki reported: > I have this bug during suspend to disk: > > [ 188.592151] Enabling non-boot CPUs ... > [ 188.592151] SMP alternatives: switching to SMP code > [ 188.666058] BUG: using smp_processor_id() in preemptible > [00000000] > code: suspend_to_disk/2934 > [ 188.666064] caller is native_sched_clock+0x2b/0x80 Which, as noted by Linus, was caused by me, via: 7cbaef9c "sched: optimize sched_clock() a bit" Move the rq locking a bit earlier in the initialization sequence, that will make the sched_clock() call in init_idle() non-preemptible. Reported-by: Maciej Rutecki Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3bafbe350f4..c94baf2969e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5870,6 +5870,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; + spin_lock_irqsave(&rq->lock, flags); + __sched_fork(idle); idle->se.exec_start = sched_clock(); @@ -5877,7 +5879,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) idle->cpus_allowed = cpumask_of_cpu(cpu); __set_task_cpu(idle, cpu); - spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; -- cgit v1.2.3-70-g09d2 From 687446760bd008df96655cb8c5900f8e48a7118c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 12 Nov 2008 13:26:49 -0800 Subject: freezer_cg: remove task_lock from freezer_fork() In theory the task can be moved to another cgroup and the freezer will be freed right after task_lock is dropped, so the lock results in zero protection. But in the case of freezer_fork() no lock is needed, since the task is not in tasklist yet so it won't be moved to another cgroup, so task->cgroups won't be changed or invalidated. Signed-off-by: Li Zefan Cc: Matt Helsley Cc: Cedric Le Goater Cc: "Serge E. Hallyn" Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup_freezer.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 7fa476f01d0..66059071040 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -184,9 +184,13 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) { struct freezer *freezer; - task_lock(task); + /* + * No lock is needed, since the task isn't on tasklist yet, + * so it can't be moved to another cgroup, which means the + * freezer won't be removed and will be valid during this + * function call. 
+ */ freezer = task_freezer(task); - task_unlock(task); spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); -- cgit v1.2.3-70-g09d2 From 3b1b3f6e57064aa8f91c290fe51cda4c74642902 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 12 Nov 2008 13:26:50 -0800 Subject: freezer_cg: disable writing freezer.state of root cgroup With this change, control file 'freezer.state' doesn't exist in root cgroup, making root cgroup unfreezable. I think it's reasonable to disallow freeze tasks in the root cgroup. And then we can avoid fork overhead when freezer subsystem is compiled but not used. Also make writing invalid value to freezer.state returns EINVAL rather than EIO. This is more consistent with other cgroup subsystem. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Cedric Le Goater Cc: Paul Menage Cc: Matt Helsley Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/freezer-subsystem.txt | 21 ++++++++++++--------- kernel/cgroup_freezer.c | 11 ++++++++++- 2 files changed, 22 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/freezer-subsystem.txt b/Documentation/cgroups/freezer-subsystem.txt index c50ab58b72e..41f37fea127 100644 --- a/Documentation/cgroups/freezer-subsystem.txt +++ b/Documentation/cgroups/freezer-subsystem.txt @@ -1,4 +1,4 @@ - The cgroup freezer is useful to batch job management system which start +The cgroup freezer is useful to batch job management system which start and stop sets of tasks in order to schedule the resources of a machine according to the desires of a system administrator. This sort of program is often used on HPC clusters to schedule access to the cluster as a @@ -6,7 +6,7 @@ whole. The cgroup freezer uses cgroups to describe the set of tasks to be started/stopped by the batch job management system. It also provides a means to start and stop the tasks composing the job. - The cgroup freezer will also be useful for checkpointing running groups +The cgroup freezer will also be useful for checkpointing running groups of tasks. The freezer allows the checkpoint code to obtain a consistent image of the tasks by attempting to force the tasks in a cgroup into a quiescent state. Once the tasks are quiescent another task can @@ -16,7 +16,7 @@ recoverable error occur. This also allows the checkpointed tasks to be migrated between nodes in a cluster by copying the gathered information to another node and restarting the tasks there. - Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping +Sequences of SIGSTOP and SIGCONT are not always sufficient for stopping and resuming tasks in userspace. Both of these signals are observable from within the tasks we wish to freeze. While SIGSTOP cannot be caught, blocked, or ignored it can be seen by waiting or ptracing parent tasks. @@ -37,26 +37,29 @@ demonstrate this problem using nested bash shells: - This happens because bash can observe both signals and choose how it +This happens because bash can observe both signals and choose how it responds to them. - Another example of a program which catches and responds to these +Another example of a program which catches and responds to these signals is gdb. In fact any program designed to use ptrace is likely to have a problem with this method of stopping and resuming tasks. 
- In contrast, the cgroup freezer uses the kernel freezer code to +In contrast, the cgroup freezer uses the kernel freezer code to prevent the freeze/unfreeze cycle from becoming visible to the tasks being frozen. This allows the bash example above and gdb to run as expected. - The freezer subsystem in the container filesystem defines a file named +The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "THAWED" will unfreeze the tasks in the cgroup. Reading will return the current state. +Note freezer.state doesn't exist in root cgroup, which means root cgroup +is non-freezable. + * Examples of usage : - # mkdir /containers/freezer + # mkdir /containers # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks @@ -94,6 +97,6 @@ things happens: the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal - and returns EIO) + and returns EINVAL) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 66059071040..fb249e2bcad 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -192,6 +192,13 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) */ freezer = task_freezer(task); + /* + * The root cgroup is non-freezable, so we can skip the + * following check. + */ + if (!freezer->css.cgroup->parent) + return; + spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); @@ -335,7 +342,7 @@ static int freezer_write(struct cgroup *cgroup, else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) goal_state = CGROUP_FROZEN; else - return -EIO; + return -EINVAL; if (!cgroup_lock_live_group(cgroup)) return -ENODEV; @@ -354,6 +361,8 @@ static struct cftype files[] = { static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) { + if (!cgroup->parent) + return 0; return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); } -- cgit v1.2.3-70-g09d2 From a189d0350f387786b1fb5a5d19e3a5ab0bc0cceb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 12 Nov 2008 13:26:51 -0800 Subject: kprobes: disable preempt for module_text_address() and kernel_text_address() __register_kprobe() can be preempted after checking probing address but before module_text_address() or try_module_get(), and in this interval the module can be unloaded. In that case, try_module_get(probed_mod) will access to invalid address, or kprobe will probe invalid address. This patch uses preempt_disable() to protect it and uses __module_text_address() and __kernel_text_address(). 
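The fix follows the usual pattern for resolving a text address that may live in a module: do the lookup and take the module reference inside one non-preemptible section, so the module cannot be unloaded between the two steps. In outline (a condensed sketch; the exact logic, including the self-probing special case, is in the diff below):

	preempt_disable();
	if (!__kernel_text_address(addr)) {
		preempt_enable();
		return -EINVAL;			/* not kernel or module text */
	}
	mod = __module_text_address(addr);	/* lockless variant, only valid while non-preemptible */
	if (mod && unlikely(!try_module_get(mod))) {
		preempt_enable();
		return -EINVAL;			/* module is already going away */
	}
	preempt_enable();
	/* from here on a non-NULL mod is pinned by the reference we took */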
Signed-off-by: Lai Jiangshan Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Hiroshi Shimamoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8b57a2597f2..f83c5e42fb0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -613,30 +613,37 @@ static int __kprobes __register_kprobe(struct kprobe *p, return -EINVAL; p->addr = addr; - if (!kernel_text_address((unsigned long) p->addr) || - in_kprobes_functions((unsigned long) p->addr)) + preempt_disable(); + if (!__kernel_text_address((unsigned long) p->addr) || + in_kprobes_functions((unsigned long) p->addr)) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 0; /* * Check if are we probing a module. */ - probed_mod = module_text_address((unsigned long) p->addr); + probed_mod = __module_text_address((unsigned long) p->addr); if (probed_mod) { - struct module *calling_mod = module_text_address(called_from); + struct module *calling_mod; + calling_mod = __module_text_address(called_from); /* * We must allow modules to probe themself and in this case * avoid incrementing the module refcount, so as to allow * unloading of self probing modules. */ if (calling_mod && calling_mod != probed_mod) { - if (unlikely(!try_module_get(probed_mod))) + if (unlikely(!try_module_get(probed_mod))) { + preempt_enable(); return -EINVAL; + } p->mod_refcounted = 1; } else probed_mod = NULL; } + preempt_enable(); p->nmissed = 0; INIT_LIST_HEAD(&p->list); @@ -718,6 +725,10 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) struct kprobe *old_p; if (p->mod_refcounted) { + /* + * Since we've already incremented refcount, + * we don't need to disable preemption. + */ mod = module_text_address((unsigned long)p->addr); if (mod) module_put(mod); -- cgit v1.2.3-70-g09d2 From 7e036d040a28bf95255d7eb9faf0ffbba3677e99 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 12 Nov 2008 13:26:57 -0800 Subject: kernel/kprobes.c: don't pad kretprobe_table_locks[] on uniprocessor builds We only need the cacheline padding on SMP kernels. Saves 6k: text data bss dec hex filename 5713 388 8840 14941 3a5d kernel/kprobes.o 5713 388 2632 8733 221d kernel/kprobes.o Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f83c5e42fb0..9f8a3f25259 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -72,7 +72,7 @@ static bool kprobe_enabled; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned; + spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -- cgit v1.2.3-70-g09d2 From 3ff68a6a106c362a6811d3e51bced58e6fc87de7 Mon Sep 17 00:00:00 2001 From: Mark Nelson Date: Thu, 13 Nov 2008 21:37:41 +1100 Subject: genirq: __irq_set_trigger: change pr_warning to pr_debug Commit 0c5d1eb77a8be917b638344a22afe1398236482b (genirq: record trigger type) caused powerpc platforms that had no set_type() function in their struct irq_chip to spew out warnings about "No set_type function for IRQ...". 
This warning isn't necessarily justified though because the generic powerpc platform code calls set_irq_type() (which in turn calls __irq_set_trigger) with information from the device tree to establish the interrupt mappings, regardless of whether the PIC can actually set a type. A platform's irq_chip might not have a set_type function for a variety of reasons, for example: the platform may have the type essentially hard-coded, or as in the case for Cell interrupts are just messages past around that have no real concept of type, or the platform could even have a virtual PIC as on the PS3. Signed-off-by: Mark Nelson Signed-off-by: Ingo Molnar --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 435861284e4..801addda3c4 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -365,7 +365,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, * IRQF_TRIGGER_* but the PIC does not support multiple * flow-types? */ - pr_warning("No set_type function for IRQ %d (%s)\n", irq, + pr_debug("No set_type function for IRQ %d (%s)\n", irq, chip ? (chip->name ? : "unknown") : "unknown"); return 0; } -- cgit v1.2.3-70-g09d2 From ee51a1de7e3837577412be269e0100038068e691 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Nov 2008 14:58:31 +0100 Subject: tracing: fix mmiotrace resizing crash Pekka reported a crash when resizing the mmiotrace tracer (if only mmiotrace is enabled). This happens because in that case we do not allocate the max buffer, but we try to use it. Make ring_buffer_resize() idempotent against NULL buffers. Reported-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 231db209fa8..036456cbb4f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -538,6 +538,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) LIST_HEAD(pages); int i, cpu; + /* + * Always succeed at resizing a non-existent buffer: + */ + if (!buffer) + return size; + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); size *= BUF_PAGE_SIZE; buffer_size = buffer->pages * BUF_PAGE_SIZE; -- cgit v1.2.3-70-g09d2 From 8141c7f3e7aee618312fa1c15109e1219de784a7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 15 Nov 2008 10:20:36 -0800 Subject: Move "exit_robust_list" into mm_release() We don't want to get rid of the futexes just at exit() time, we want to drop them when doing an execve() too, since that gets rid of the previous VM image too. Doing it at mm_release() time means that we automatically always do it when we disassociate a VM map from the task. 
Reported-by: pageexec@freemail.hu Cc: Andrew Morton Cc: Nick Piggin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Brad Spengler Cc: Alex Efros Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 9 --------- kernel/fork.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index ae2b92be5fa..2d8be7ebb0f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include /* for audit_free() */ #include @@ -1059,14 +1058,6 @@ NORET_TYPE void do_exit(long code) exit_itimers(tsk->signal); } acct_collect(code, group_dead); -#ifdef CONFIG_FUTEX - if (unlikely(tsk->robust_list)) - exit_robust_list(tsk); -#ifdef CONFIG_COMPAT - if (unlikely(tsk->compat_robust_list)) - compat_exit_robust_list(tsk); -#endif -#endif if (group_dead) tty_audit_exit(); if (unlikely(tsk->audit_context)) diff --git a/kernel/fork.c b/kernel/fork.c index f6083561dfe..2a372a0e206 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -519,6 +520,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { struct completion *vfork_done = tsk->vfork_done; + /* Get rid of any futexes when releasing the mm */ +#ifdef CONFIG_FUTEX + if (unlikely(tsk->robust_list)) + exit_robust_list(tsk); +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) + compat_exit_robust_list(tsk); +#endif +#endif + /* Get rid of any cached register state */ deactivate_mm(tsk, mm); -- cgit v1.2.3-70-g09d2 From 8f7b0ba1c853919b85b54774775f567f30006107 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 15 Nov 2008 01:15:43 +0000 Subject: Fix inotify watch removal/umount races Inotify watch removals suck violently. To kick the watch out we need (in this order) inode->inotify_mutex and ih->mutex. That's fine if we have a hold on inode; however, for all other cases we need to make damn sure we don't race with umount. We can *NOT* just grab a reference to a watch - inotify_unmount_inodes() will happily sail past it and we'll end with reference to inode potentially outliving its superblock. Ideally we just want to grab an active reference to superblock if we can; that will make sure we won't go into inotify_umount_inodes() until we are done. Cleanup is just deactivate_super(). However, that leaves a messy case - what if we *are* racing with umount() and active references to superblock can't be acquired anymore? We can bump ->s_count, grab ->s_umount, which will almost certainly wait until the superblock is shut down and the watch in question is pining for fjords. That's fine, but there is a problem - we might have hit the window between ->s_active getting to 0 / ->s_count - below S_BIAS (i.e. the moment when superblock is past the point of no return and is heading for shutdown) and the moment when deactivate_super() acquires ->s_umount. We could just do drop_super() yield() and retry, but that's rather antisocial and this stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having found that we'd got there first (i.e. that ->s_root is non-NULL) we know that we won't race with inotify_umount_inodes(). So we could grab a reference to watch and do the rest as above, just with drop_super() instead of deactivate_super(), right? Wrong. We had to drop ih->mutex before we could grab ->s_umount. So the watch could've been gone already. 
That still can be dealt with - we need to save watch->wd, do idr_find() and compare its result with our pointer. If they match, we either have the damn thing still alive or we'd lost not one but two races at once, the watch had been killed and a new one got created with the same ->wd at the same address. That couldn't have happened in inotify_destroy(), but inotify_rm_wd() could run into that. Still, "new one got created" is not a problem - we have every right to kill it or leave it alone, whatever's more convenient. So we can use idr_find(...) == watch && watch->inode->i_sb == sb as "grab it and kill it" check. If it's been our original watch, we are fine, if it's a newcomer - nevermind, just pretend that we'd won the race and kill the fscker anyway; we are safe since we know that its superblock won't be going away. And yes, this is far beyond mere "not very pretty"; so's the entire concept of inotify to start with. Signed-off-by: Al Viro Acked-by: Greg KH Signed-off-by: Linus Torvalds --- fs/inotify.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++-- include/linux/inotify.h | 11 ++++ kernel/audit_tree.c | 91 +++++++++++++++++------------ kernel/auditfilter.c | 14 +++-- 4 files changed, 218 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/fs/inotify.c b/fs/inotify.c index 690e72595e6..7bbed1b8982 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -106,6 +106,20 @@ void get_inotify_watch(struct inotify_watch *watch) } EXPORT_SYMBOL_GPL(get_inotify_watch); +int pin_inotify_watch(struct inotify_watch *watch) +{ + struct super_block *sb = watch->inode->i_sb; + spin_lock(&sb_lock); + if (sb->s_count >= S_BIAS) { + atomic_inc(&sb->s_active); + spin_unlock(&sb_lock); + atomic_inc(&watch->count); + return 1; + } + spin_unlock(&sb_lock); + return 0; +} + /** * put_inotify_watch - decrements the ref count on a given watch. cleans up * watch references if the count reaches zero. inotify_watch is freed by @@ -124,6 +138,13 @@ void put_inotify_watch(struct inotify_watch *watch) } EXPORT_SYMBOL_GPL(put_inotify_watch); +void unpin_inotify_watch(struct inotify_watch *watch) +{ + struct super_block *sb = watch->inode->i_sb; + put_inotify_watch(watch); + deactivate_super(sb); +} + /* * inotify_handle_get_wd - returns the next WD for use by the given handle * @@ -479,6 +500,112 @@ void inotify_init_watch(struct inotify_watch *watch) } EXPORT_SYMBOL_GPL(inotify_init_watch); +/* + * Watch removals suck violently. To kick the watch out we need (in this + * order) inode->inotify_mutex and ih->mutex. That's fine if we have + * a hold on inode; however, for all other cases we need to make damn sure + * we don't race with umount. We can *NOT* just grab a reference to a + * watch - inotify_unmount_inodes() will happily sail past it and we'll end + * with reference to inode potentially outliving its superblock. Ideally + * we just want to grab an active reference to superblock if we can; that + * will make sure we won't go into inotify_umount_inodes() until we are + * done. Cleanup is just deactivate_super(). However, that leaves a messy + * case - what if we *are* racing with umount() and active references to + * superblock can't be acquired anymore? We can bump ->s_count, grab + * ->s_umount, which will almost certainly wait until the superblock is shut + * down and the watch in question is pining for fjords. That's fine, but + * there is a problem - we might have hit the window between ->s_active + * getting to 0 / ->s_count - below S_BIAS (i.e. 
the moment when superblock + * is past the point of no return and is heading for shutdown) and the + * moment when deactivate_super() acquires ->s_umount. We could just do + * drop_super() yield() and retry, but that's rather antisocial and this + * stuff is luser-triggerable. OTOH, having grabbed ->s_umount and having + * found that we'd got there first (i.e. that ->s_root is non-NULL) we know + * that we won't race with inotify_umount_inodes(). So we could grab a + * reference to watch and do the rest as above, just with drop_super() instead + * of deactivate_super(), right? Wrong. We had to drop ih->mutex before we + * could grab ->s_umount. So the watch could've been gone already. + * + * That still can be dealt with - we need to save watch->wd, do idr_find() + * and compare its result with our pointer. If they match, we either have + * the damn thing still alive or we'd lost not one but two races at once, + * the watch had been killed and a new one got created with the same ->wd + * at the same address. That couldn't have happened in inotify_destroy(), + * but inotify_rm_wd() could run into that. Still, "new one got created" + * is not a problem - we have every right to kill it or leave it alone, + * whatever's more convenient. + * + * So we can use idr_find(...) == watch && watch->inode->i_sb == sb as + * "grab it and kill it" check. If it's been our original watch, we are + * fine, if it's a newcomer - nevermind, just pretend that we'd won the + * race and kill the fscker anyway; we are safe since we know that its + * superblock won't be going away. + * + * And yes, this is far beyond mere "not very pretty"; so's the entire + * concept of inotify to start with. + */ + +/** + * pin_to_kill - pin the watch down for removal + * @ih: inotify handle + * @watch: watch to kill + * + * Called with ih->mutex held, drops it. Possible return values: + * 0 - nothing to do, it has died + * 1 - remove it, drop the reference and deactivate_super() + * 2 - remove it, drop the reference and drop_super(); we tried hard to avoid + * that variant, since it involved a lot of PITA, but that's the best that + * could've been done. + */ +static int pin_to_kill(struct inotify_handle *ih, struct inotify_watch *watch) +{ + struct super_block *sb = watch->inode->i_sb; + s32 wd = watch->wd; + + spin_lock(&sb_lock); + if (sb->s_count >= S_BIAS) { + atomic_inc(&sb->s_active); + spin_unlock(&sb_lock); + get_inotify_watch(watch); + mutex_unlock(&ih->mutex); + return 1; /* the best outcome */ + } + sb->s_count++; + spin_unlock(&sb_lock); + mutex_unlock(&ih->mutex); /* can't grab ->s_umount under it */ + down_read(&sb->s_umount); + if (likely(!sb->s_root)) { + /* fs is already shut down; the watch is dead */ + drop_super(sb); + return 0; + } + /* raced with the final deactivate_super() */ + mutex_lock(&ih->mutex); + if (idr_find(&ih->idr, wd) != watch || watch->inode->i_sb != sb) { + /* the watch is dead */ + mutex_unlock(&ih->mutex); + drop_super(sb); + return 0; + } + /* still alive or freed and reused with the same sb and wd; kill */ + get_inotify_watch(watch); + mutex_unlock(&ih->mutex); + return 2; +} + +static void unpin_and_kill(struct inotify_watch *watch, int how) +{ + struct super_block *sb = watch->inode->i_sb; + put_inotify_watch(watch); + switch (how) { + case 1: + deactivate_super(sb); + break; + case 2: + drop_super(sb); + } +} + /** * inotify_destroy - clean up and destroy an inotify instance * @ih: inotify handle @@ -490,11 +617,15 @@ void inotify_destroy(struct inotify_handle *ih) * pretty. 
We cannot do a simple iteration over the list, because we * do not know the inode until we iterate to the watch. But we need to * hold inode->inotify_mutex before ih->mutex. The following works. + * + * AV: it had to become even uglier to start working ;-/ */ while (1) { struct inotify_watch *watch; struct list_head *watches; + struct super_block *sb; struct inode *inode; + int how; mutex_lock(&ih->mutex); watches = &ih->watches; @@ -503,8 +634,10 @@ void inotify_destroy(struct inotify_handle *ih) break; } watch = list_first_entry(watches, struct inotify_watch, h_list); - get_inotify_watch(watch); - mutex_unlock(&ih->mutex); + sb = watch->inode->i_sb; + how = pin_to_kill(ih, watch); + if (!how) + continue; inode = watch->inode; mutex_lock(&inode->inotify_mutex); @@ -518,7 +651,7 @@ void inotify_destroy(struct inotify_handle *ih) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(watch); + unpin_and_kill(watch, how); } /* free this handle: the put matching the get in inotify_init() */ @@ -719,7 +852,9 @@ void inotify_evict_watch(struct inotify_watch *watch) int inotify_rm_wd(struct inotify_handle *ih, u32 wd) { struct inotify_watch *watch; + struct super_block *sb; struct inode *inode; + int how; mutex_lock(&ih->mutex); watch = idr_find(&ih->idr, wd); @@ -727,9 +862,12 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); return -EINVAL; } - get_inotify_watch(watch); + sb = watch->inode->i_sb; + how = pin_to_kill(ih, watch); + if (!how) + return 0; + inode = watch->inode; - mutex_unlock(&ih->mutex); mutex_lock(&inode->inotify_mutex); mutex_lock(&ih->mutex); @@ -740,7 +878,7 @@ int inotify_rm_wd(struct inotify_handle *ih, u32 wd) mutex_unlock(&ih->mutex); mutex_unlock(&inode->inotify_mutex); - put_inotify_watch(watch); + unpin_and_kill(watch, how); return 0; } diff --git a/include/linux/inotify.h b/include/linux/inotify.h index bd578578a8b..37ea2894b3c 100644 --- a/include/linux/inotify.h +++ b/include/linux/inotify.h @@ -134,6 +134,8 @@ extern void inotify_remove_watch_locked(struct inotify_handle *, struct inotify_watch *); extern void get_inotify_watch(struct inotify_watch *); extern void put_inotify_watch(struct inotify_watch *); +extern int pin_inotify_watch(struct inotify_watch *); +extern void unpin_inotify_watch(struct inotify_watch *); #else @@ -228,6 +230,15 @@ static inline void put_inotify_watch(struct inotify_watch *watch) { } +extern inline int pin_inotify_watch(struct inotify_watch *watch) +{ + return 0; +} + +extern inline void unpin_inotify_watch(struct inotify_watch *watch) +{ +} + #endif /* CONFIG_INOTIFY */ #endif /* __KERNEL __ */ diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 8ba0e0d934f..8b509441f49 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -24,6 +24,7 @@ struct audit_chunk { struct list_head trees; /* with root here */ int dead; int count; + atomic_long_t refs; struct rcu_head head; struct node { struct list_head list; @@ -56,7 +57,8 @@ static LIST_HEAD(prune_list); * tree is refcounted; one reference for "some rules on rules_list refer to * it", one for each chunk with pointer to it. * - * chunk is refcounted by embedded inotify_watch. + * chunk is refcounted by embedded inotify_watch + .refs (non-zero refcount + * of watch contributes 1 to .refs). * * node.index allows to get from node.list to containing chunk. 
* MSB of that sucker is stolen to mark taggings that we might have to @@ -121,6 +123,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->hash); INIT_LIST_HEAD(&chunk->trees); chunk->count = count; + atomic_long_set(&chunk->refs, 1); for (i = 0; i < count; i++) { INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; @@ -129,9 +132,8 @@ static struct audit_chunk *alloc_chunk(int count) return chunk; } -static void __free_chunk(struct rcu_head *rcu) +static void free_chunk(struct audit_chunk *chunk) { - struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); int i; for (i = 0; i < chunk->count; i++) { @@ -141,14 +143,16 @@ static void __free_chunk(struct rcu_head *rcu) kfree(chunk); } -static inline void free_chunk(struct audit_chunk *chunk) +void audit_put_chunk(struct audit_chunk *chunk) { - call_rcu(&chunk->head, __free_chunk); + if (atomic_long_dec_and_test(&chunk->refs)) + free_chunk(chunk); } -void audit_put_chunk(struct audit_chunk *chunk) +static void __put_chunk(struct rcu_head *rcu) { - put_inotify_watch(&chunk->watch); + struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); + audit_put_chunk(chunk); } enum {HASH_SIZE = 128}; @@ -176,7 +180,7 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode) list_for_each_entry_rcu(p, list, hash) { if (p->watch.inode == inode) { - get_inotify_watch(&p->watch); + atomic_long_inc(&p->refs); return p; } } @@ -194,17 +198,49 @@ int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) /* tagging and untagging inodes with trees */ -static void untag_chunk(struct audit_chunk *chunk, struct node *p) +static struct audit_chunk *find_chunk(struct node *p) +{ + int index = p->index & ~(1U<<31); + p -= index; + return container_of(p, struct audit_chunk, owners[0]); +} + +static void untag_chunk(struct node *p) { + struct audit_chunk *chunk = find_chunk(p); struct audit_chunk *new; struct audit_tree *owner; int size = chunk->count - 1; int i, j; + if (!pin_inotify_watch(&chunk->watch)) { + /* + * Filesystem is shutting down; all watches are getting + * evicted, just take it off the node list for this + * tree and let the eviction logics take care of the + * rest. + */ + owner = p->owner; + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); + return; + } + + spin_unlock(&hash_lock); + + /* + * pin_inotify_watch() succeeded, so the watch won't go away + * from under us. 
+ */ mutex_lock(&chunk->watch.inode->inotify_mutex); if (chunk->dead) { mutex_unlock(&chunk->watch.inode->inotify_mutex); - return; + goto out; } owner = p->owner; @@ -221,7 +257,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; } new = alloc_chunk(size); @@ -263,7 +299,7 @@ static void untag_chunk(struct audit_chunk *chunk, struct node *p) inotify_evict_watch(&chunk->watch); mutex_unlock(&chunk->watch.inode->inotify_mutex); put_inotify_watch(&chunk->watch); - return; + goto out; Fallback: // do the best we can @@ -277,6 +313,9 @@ Fallback: put_tree(owner); spin_unlock(&hash_lock); mutex_unlock(&chunk->watch.inode->inotify_mutex); +out: + unpin_inotify_watch(&chunk->watch); + spin_lock(&hash_lock); } static int create_chunk(struct inode *inode, struct audit_tree *tree) @@ -387,13 +426,6 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) return 0; } -static struct audit_chunk *find_chunk(struct node *p) -{ - int index = p->index & ~(1U<<31); - p -= index; - return container_of(p, struct audit_chunk, owners[0]); -} - static void kill_rules(struct audit_tree *tree) { struct audit_krule *rule, *next; @@ -431,17 +463,10 @@ static void prune_one(struct audit_tree *victim) spin_lock(&hash_lock); while (!list_empty(&victim->chunks)) { struct node *p; - struct audit_chunk *chunk; p = list_entry(victim->chunks.next, struct node, list); - chunk = find_chunk(p); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, p); - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(p); } spin_unlock(&hash_lock); put_tree(victim); @@ -469,7 +494,6 @@ static void trim_marked(struct audit_tree *tree) while (!list_empty(&tree->chunks)) { struct node *node; - struct audit_chunk *chunk; node = list_entry(tree->chunks.next, struct node, list); @@ -477,14 +501,7 @@ static void trim_marked(struct audit_tree *tree) if (!(node->index & (1U<<31))) break; - chunk = find_chunk(node); - get_inotify_watch(&chunk->watch); - spin_unlock(&hash_lock); - - untag_chunk(chunk, node); - - put_inotify_watch(&chunk->watch); - spin_lock(&hash_lock); + untag_chunk(node); } if (!tree->root && !tree->goner) { tree->goner = 1; @@ -878,7 +895,7 @@ static void handle_event(struct inotify_watch *watch, u32 wd, u32 mask, static void destroy_watch(struct inotify_watch *watch) { struct audit_chunk *chunk = container_of(watch, struct audit_chunk, watch); - free_chunk(chunk); + call_rcu(&chunk->head, __put_chunk); } static const struct inotify_operations rtree_inotify_ops = { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b7d354e2b0e..9fd85a4640a 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1094,8 +1094,8 @@ static void audit_inotify_unregister(struct list_head *in_list) list_for_each_entry_safe(p, n, in_list, ilist) { list_del(&p->ilist); inotify_rm_watch(audit_ih, &p->wdata); - /* the put matching the get in audit_do_del_rule() */ - put_inotify_watch(&p->wdata); + /* the unpin matching the pin in audit_do_del_rule() */ + unpin_inotify_watch(&p->wdata); } } @@ -1389,9 +1389,13 @@ static inline int audit_del_rule(struct audit_entry *entry, /* Put parent on the inotify un-registration * list. Grab a reference before releasing * audit_filter_mutex, to be released in - * audit_inotify_unregister(). 
*/ - list_add(&parent->ilist, &inotify_list); - get_inotify_watch(&parent->wdata); + * audit_inotify_unregister(). + * If filesystem is going away, just leave + * the sucker alone, eviction will take + * care of it. + */ + if (pin_inotify_watch(&parent->wdata)) + list_add(&parent->ilist, &inotify_list); } } } -- cgit v1.2.3-70-g09d2 From 29d7b90c15035741d15421b36000509212b3e135 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 16 Nov 2008 08:07:15 +0100 Subject: sched: fix kernel warning on /proc/sched_debug access Luis Henriques reported that with CONFIG_PREEMPT=y + CONFIG_PREEMPT_DEBUG=y + CONFIG_SCHED_DEBUG=y + CONFIG_LATENCYTOP=y enabled, the following warning triggers when using latencytop: > [ 775.663239] BUG: using smp_processor_id() in preemptible [00000000] code: latencytop/6585 > [ 775.663303] caller is native_sched_clock+0x3a/0x80 > [ 775.663314] Pid: 6585, comm: latencytop Tainted: G W 2.6.28-rc4-00355-g9c7c354 #1 > [ 775.663322] Call Trace: > [ 775.663343] [] debug_smp_processor_id+0xe4/0xf0 > [ 775.663356] [] native_sched_clock+0x3a/0x80 > [ 775.663368] [] sched_clock+0x9/0x10 > [ 775.663381] [] proc_sched_show_task+0x8bd/0x10e0 > [ 775.663395] [] sched_show+0x3e/0x80 > [ 775.663408] [] seq_read+0xdb/0x350 > [ 775.663421] [] ? security_file_permission+0x16/0x20 > [ 775.663435] [] vfs_read+0xc8/0x170 > [ 775.663447] [] sys_read+0x55/0x90 > [ 775.663460] [] system_call_fastpath+0x16/0x1b > ... This breakage was caused by me via: 7cbaef9: sched: optimize sched_clock() a bit Change the calls to cpu_clock(). Reported-by: Luis Henriques --- kernel/sched_debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 48ecc51e770..26ed8e3d1c1 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -423,10 +423,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #undef __P { + unsigned int this_cpu = raw_smp_processor_id(); u64 t0, t1; - t0 = sched_clock(); - t1 = sched_clock(); + t0 = cpu_clock(this_cpu); + t1 = cpu_clock(this_cpu); SEQ_printf(m, "%-35s:%21Ld\n", "clock-delta", (long long)(t1-t0)); } -- cgit v1.2.3-70-g09d2 From 5821e1b74f0d08952cb5da4bfd2d9a388d8df58e Mon Sep 17 00:00:00 2001 From: walimis Date: Sat, 15 Nov 2008 15:19:06 +0800 Subject: function tracing: fix wrong pos computing when read buffer has been fulfilled Impact: make output of available_filter_functions complete phenomenon: The first value of dyn_ftrace_total_info is not equal with `cat available_filter_functions | wc -l`, but they should be equal. root cause: When printing functions with seq_printf in t_show, if the read buffer is just overflowed by current function record, then this function won't be printed to user space through read buffer, it will just be dropped. So we can't see this function printing. So, every time the last function to fill the read buffer, if overflowed, will be dropped. This also applies to set_ftrace_filter if set_ftrace_filter has more bytes than read buffer. fix: Through checking return value of seq_printf, if less than 0, we know this function doesn't be printed. Then we decrease position to force this function to be printed next time, in next read buffer. Another little fix is to show correct allocating pages count. 
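The underlying seq_file technique is simply: when seq_printf() reports overflow, step the iterator back so the dropped record is emitted again on the next read. A generic sketch of a show handler using it (the surrounding iterator type is illustrative; only the pos/idx bookkeeping mirrors the ftrace fix below):

	static int example_show(struct seq_file *m, void *v)
	{
		struct example_iter *iter = m->private;	/* illustrative per-file iterator */

		if (seq_printf(m, "%s\n", iter->name) < 0) {
			/* read buffer overflowed: rewind so this record is retried next time */
			iter->pos--;
			iter->idx--;
		}
		return 0;
	}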
Signed-off-by: walimis Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 14fa52297b2..e60205722d0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -673,7 +673,7 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) cnt = num_to_init / ENTRIES_PER_PAGE; pr_info("ftrace: allocating %ld entries in %d pages\n", - num_to_init, cnt); + num_to_init, cnt + 1); for (i = 0; i < cnt; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); @@ -753,13 +753,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l = -1; - if (*pos != iter->pos) { - for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l)) - ; - } else { - l = *pos; - p = t_next(m, p, &l); - } + if (*pos > iter->pos) + *pos = iter->pos; + + l = *pos; + p = t_next(m, p, &l); return p; } @@ -770,15 +768,21 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { + struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec = v; char str[KSYM_SYMBOL_LEN]; + int ret = 0; if (!rec) return 0; kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - seq_printf(m, "%s\n", str); + ret = seq_printf(m, "%s\n", str); + if (ret < 0) { + iter->pos--; + iter->idx--; + } return 0; } @@ -804,7 +808,7 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENOMEM; iter->pg = ftrace_pages_start; - iter->pos = -1; + iter->pos = 0; ret = seq_open(file, &show_ftrace_seq_ops); if (!ret) { @@ -891,7 +895,7 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; - iter->pos = -1; + iter->pos = 0; iter->flags = enable ? FTRACE_ITER_FILTER : FTRACE_ITER_NOTRACE; -- cgit v1.2.3-70-g09d2 From e14c8bf86350f6c39186a139c5c584a6111b2f01 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 17 Nov 2008 08:22:18 +1030 Subject: stop_machine: fix race with return value (fixes Bug #11989) Bug #11989: Suspend failure on NForce4-based boards due to chanes in stop_machine We should not access active.fnret outside the lock; in theory the next stop_machine could overwrite it. Signed-off-by: Rusty Russell Tested-by: "Rafael J. Wysocki" Signed-off-by: Linus Torvalds --- kernel/stop_machine.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 9bc4c00872c..24e8ceacc38 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -112,7 +112,7 @@ static int chill(void *unused) int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) { struct work_struct *sm_work; - int i; + int i, ret; /* Set up initial state. */ mutex_lock(&lock); @@ -137,8 +137,9 @@ int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) /* This will release the thread on our CPU. 
*/ put_cpu(); flush_workqueue(stop_machine_wq); + ret = active.fnret; mutex_unlock(&lock); - return active.fnret; + return ret; } int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus) -- cgit v1.2.3-70-g09d2 From ad133ba3dc283300e5b62b5b7211d2f39fbf6ee7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 17 Nov 2008 15:39:47 +0100 Subject: sched, signals: fix the racy usage of ->signal in account_group_xxx/run_posix_cpu_timers Impact: fix potential NULL dereference Contrary to ad474caca3e2a0550b7ce0706527ad5ab389a4d4 changelog, other acct_group_xxx() helpers can be called after exit_notify() by timer tick. Thanks to Roland for pointing out this. Somehow I missed this simple fact when I read the original patch, and I am afraid I confused Frank during the discussion. Sorry. Fortunately, these helpers work with current, we can check ->exit_state to ensure that ->signal can't go away under us. Also, add the comment and compiler barrier to account_group_exec_runtime(), to make sure we load ->signal only once. Signed-off-by: Oleg Nesterov Signed-off-by: Ingo Molnar --- kernel/posix-cpu-timers.c | 7 +++++-- kernel/sched_stats.h | 15 +++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 153dcb2639c..895337b16a2 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1308,9 +1308,10 @@ static inline int task_cputime_expired(const struct task_cputime *sample, */ static inline int fastpath_timer_check(struct task_struct *tsk) { - struct signal_struct *sig = tsk->signal; + struct signal_struct *sig; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal/sighand */ + if (unlikely(tsk->exit_state)) return 0; if (!task_cputime_zero(&tsk->cputime_expires)) { @@ -1323,6 +1324,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) return 1; } + + sig = tsk->signal; if (!task_cputime_zero(&sig->cputime_expires)) { struct task_cputime group_sample; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index ee71bec1da6..7dbf72a2b02 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -298,9 +298,11 @@ static inline void account_group_user_time(struct task_struct *tsk, { struct signal_struct *sig; - sig = tsk->signal; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal */ + if (unlikely(tsk->exit_state)) return; + + sig = tsk->signal; if (sig->cputime.totals) { struct task_cputime *times; @@ -325,9 +327,11 @@ static inline void account_group_system_time(struct task_struct *tsk, { struct signal_struct *sig; - sig = tsk->signal; - if (unlikely(!sig)) + /* tsk == current, ensure it is safe to use ->signal */ + if (unlikely(tsk->exit_state)) return; + + sig = tsk->signal; if (sig->cputime.totals) { struct task_cputime *times; @@ -353,8 +357,11 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, struct signal_struct *sig; sig = tsk->signal; + /* see __exit_signal()->task_rq_unlock_wait() */ + barrier(); if (unlikely(!sig)) return; + if (sig->cputime.totals) { struct task_cputime *times; -- cgit v1.2.3-70-g09d2 From 65ecc14a30ad21bed9aabdfd6a2ae1a1aaaa6a00 Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Sat, 15 Nov 2008 12:02:34 -0600 Subject: Remove -mno-spe flags as they dont belong For some unknown reason at Steven Rostedt added in disabling of the SPE instruction generation for e500 based PPC cores in commit 
6ec562328fda585be2d7f472cfac99d3b44d362a. We are removing it because: 1. It generates e500 kernels that don't work 2. its not the correct set of flags to do this 3. we handle this in the arch/powerpc/Makefile already 4. its unknown in talking to Steven why he did this Signed-off-by: Kumar Gala Tested-and-Acked-by: Steven Rostedt Signed-off-by: Linus Torvalds --- kernel/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9a3ec66a9d8..19fad003b19 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,8 +11,6 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o -CFLAGS_REMOVE_sched.o = -mno-spe - ifdef CONFIG_FUNCTION_TRACER # Do not trace debug files and internal ftrace files CFLAGS_REMOVE_lockdep.o = -pg @@ -21,7 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg -CFLAGS_REMOVE_sched.o = -mno-spe -pg +CFLAGS_REMOVE_sched.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o -- cgit v1.2.3-70-g09d2 From 700018e0a77b4113172257fcdaa1c58e27a5074f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 18 Nov 2008 14:02:03 +0800 Subject: cpuset: fix regression when failed to generate sched domains Impact: properly rebuild sched-domains on kmalloc() failure When cpuset failed to generate sched domains due to kmalloc() failure, the scheduler should fallback to the single partition 'fallback_doms' and rebuild sched domains, but now it only destroys but not rebuilds sched domains. The regression was introduced by: | commit dfb512ec4834116124da61d6c1ee10fd0aa32bd6 | Author: Max Krasnyansky | Date: Fri Aug 29 13:11:41 2008 -0700 | | sched: arch_reinit_sched_domains() must destroy domains to force rebuild After the above commit, partition_sched_domains(0, NULL, NULL) will only destroy sched domains and partition_sched_domains(1, NULL, NULL) will create the default sched domain. Signed-off-by: Li Zefan Cc: Max Krasnyansky Cc: Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 12 ++++++++---- kernel/sched.c | 13 +++++++------ 2 files changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e00526f52e..81fc6791a29 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -587,7 +587,6 @@ static int generate_sched_domains(cpumask_t **domains, int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - ndoms = 0; doms = NULL; dattr = NULL; csa = NULL; @@ -674,10 +673,8 @@ restart: * Convert to and populate cpu masks. */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) { - ndoms = 0; + if (!doms) goto done; - } /* * The rest of the code, including the scheduler, can deal with @@ -732,6 +729,13 @@ restart: done: kfree(csa); + /* + * Fallback to the default domain if kmalloc() failed. + * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + *domains = doms; *attributes = dattr; return ndoms; diff --git a/kernel/sched.c b/kernel/sched.c index c94baf2969e..9b1e79371c2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7789,13 +7789,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * * The passed in 'doms_new' should be kmalloc'd. This routine takes * ownership of it and will kfree it when done with it. 
If the caller - * failed the kmalloc call, then it can pass in doms_new == NULL, - * and partition_sched_domains() will fallback to the single partition - * 'fallback_doms', it also forces the domains to be rebuilt. + * failed the kmalloc call, then it can pass in doms_new == NULL && + * ndoms_new == 1, and partition_sched_domains() will fallback to + * the single partition 'fallback_doms', it also forces the domains + * to be rebuilt. * - * If doms_new==NULL it will be replaced with cpu_online_map. - * ndoms_new==0 is a special case for destroying existing domains. - * It will not create the default domain. + * If doms_new == NULL it will be replaced with cpu_online_map. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. * * Call with hotplug lock held */ -- cgit v1.2.3-70-g09d2 From e270219f4372b58bd3eeac12bd9f7edc592b8f6b Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 18 Nov 2008 10:15:24 +0600 Subject: kernel/profile.c: fix section mismatch warning Impact: fix section mismatch warning in kernel/profile.c Here, profile_nop function has been called from a non-init function create_hash_tables(void). Which generetes a section mismatch warning. Previously, create_hash_tables(void) was a init function. So, removing __init from create_hash_tables(void) requires profile_nop to be non-init. This patch makes profile_nop function inline and fixes the following warning: WARNING: vmlinux.o(.text+0x6ebb6): Section mismatch in reference from the function create_hash_tables() to the function .init.text:profile_nop() The function create_hash_tables() references the function __init profile_nop(). This is often because create_hash_tables lacks a __init annotation or the annotation of profile_nop is wrong. Signed-off-by: Rakib Mullick Signed-off-by: Ingo Molnar --- kernel/profile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/profile.c b/kernel/profile.c index 9830a037d8d..5b7d1ac7124 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -544,7 +544,7 @@ static const struct file_operations proc_profile_operations = { }; #ifdef CONFIG_SMP -static void __init profile_nop(void *unused) +static inline void profile_nop(void *unused) { } -- cgit v1.2.3-70-g09d2 From 98ba4031ab2adc8b394295e68aa4c8fe9d5060db Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 14 Nov 2008 10:44:59 +0100 Subject: relay: fix cpu offline problem relay_open() will close allocated buffers when failed. but if cpu offlined, some buffer will not be closed. this patch fixed it. and did cleanup for relay_reset() too. 
Signed-off-by: Lai Jiangshan Signed-off-by: Jens Axboe --- kernel/relay.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 8d13a7855c0..32b0befdcb6 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -400,7 +400,7 @@ void relay_reset(struct rchan *chan) } mutex_lock(&relay_channels_mutex); - for_each_online_cpu(i) + for_each_possible_cpu(i) if (chan->buf[i]) __relay_reset(chan->buf[i], 0); mutex_unlock(&relay_channels_mutex); @@ -611,10 +611,9 @@ struct rchan *relay_open(const char *base_filename, return chan; free_bufs: - for_each_online_cpu(i) { - if (!chan->buf[i]) - break; - relay_close_buf(chan->buf[i]); + for_each_possible_cpu(i) { + if (chan->buf[i]) + relay_close_buf(chan->buf[i]); } kref_put(&chan->kref, relay_destroy_channel); -- cgit v1.2.3-70-g09d2 From 0bb943c7a2136716757a263f604d26309fd98042 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 14 Nov 2008 19:05:31 +0100 Subject: tracing: kernel/trace/trace.c: introduce missing kfree() Impact: fix memory leak Error handling code following a kzalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 697eda36b86..d86e3252f30 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1936,6 +1936,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret) ring_buffer_read_finish(iter->buffer_iter[cpu]); } mutex_unlock(&trace_types_lock); + kfree(iter); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3-70-g09d2 From a6a0c4ca7edb378a8a7332501f097089cb1051c4 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 18 Nov 2008 06:56:51 -0800 Subject: suspend: use WARN not WARN_ON to print the message By using WARN(), kerneloops.org can collect which component is causing the delay and make statistics about that. suspend_test_finish() is currently the number 2 item but unless we can collect who's causing it we're not going to be able to fix the hot topic ones.. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 19122cf6d82..b8f7ce9473e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -174,7 +174,7 @@ static void suspend_test_finish(const char *label) * has some performance issues. The stack dump of a WARN_ON * is more likely to get the right attention than a printk... 
*/ - WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000)); + WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label); } #else -- cgit v1.2.3-70-g09d2 From 641d2f63cfe24539e154efa2f932937934c27dde Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 18 Nov 2008 19:22:13 +0100 Subject: trace: introduce missing mutex_unlock() Impact: fix tracing buffer mutex leak in case of allocation failure This error was spotted by this semantic patch: http://www.emn.fr/x-info/coccinelle/mut.html It looks correct as far as I can tell. Please review. Signed-off-by: Vegard Nossum Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 036456cbb4f..f780e9552f9 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -617,6 +617,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) list_del_init(&page->list); free_buffer_page(page); } + mutex_unlock(&buffer->mutex); return -ENOMEM; } -- cgit v1.2.3-70-g09d2 From f10ed36ec1118c6f9523cd7e53cb0aadb53efe9f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 7 Nov 2008 22:36:02 -0500 Subject: ftrace: fix set_ftrace_filter Impact: fix of output of set_ftrace_filter The commit "ftrace: do not show freed records in available_filter_functions" Removed a bit too much from the set_ftrace_filter code, where we now see all functions in the set_ftrace_filter file even when we set a filter. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4a39d24568c..dcac7418f68 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -738,6 +738,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) ((iter->flags & FTRACE_ITER_FAILURES) && !(rec->flags & FTRACE_FL_FAILED)) || + ((iter->flags & FTRACE_ITER_FILTER) && + !(rec->flags & FTRACE_FL_FILTER)) || + ((iter->flags & FTRACE_ITER_NOTRACE) && !(rec->flags & FTRACE_FL_NOTRACE))) { rec = NULL; -- cgit v1.2.3-70-g09d2 From 820432783190b4096499e38a4a4d7095c511913d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 18 Nov 2008 23:57:14 -0500 Subject: ftrace: make filtered functions effective on setting Impact: fix filter selection to apply when set It can be confusing when the set_filter_functions is set (or cleared) and the functions being recorded by the dynamic tracer does not match. This patch causes the code to be updated if the function tracer is enabled and the filter is changed. 
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dcac7418f68..5cbddb59e99 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1189,7 +1189,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable) mutex_lock(&ftrace_sysctl_lock); mutex_lock(&ftrace_start_lock); - if (iter->filtered && ftrace_start && ftrace_enabled) + if (ftrace_start && ftrace_enabled) ftrace_run_update_code(FTRACE_ENABLE_CALLS); mutex_unlock(&ftrace_start_lock); mutex_unlock(&ftrace_sysctl_lock); -- cgit v1.2.3-70-g09d2 From 32464779a1b8c15e9aa9aa0306b2f735080df9d8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 18 Nov 2008 20:33:02 -0500 Subject: ftrace: fix dyn ftrace filter selection Impact: clean up and fix for dyn ftrace filter selection The previous logic of the dynamic ftrace selection of enabling or disabling functions was complex and incorrect. This patch simplifies the code and corrects the usage. This simplification also makes the code more robust. Here is the correct logic: Given a function that can be traced by dynamic ftrace: If the function is not to be traced, disable it if it was enabled. (this is if the function is in the set_ftrace_notrace file) (filter is on if there exists any functions in set_ftrace_filter file) If the filter is on, and we are enabling functions: If the function is in set_ftrace_filter, enable it if it is not already enabled. If the function is not in set_ftrace_filter, disable it if it is not already disabled. Otherwise, if the filter is off and we are enabling function tracing: Enable the function if it is not already enabled. Otherwise, if we are disabling function tracing: Disable the function if it is not already disabled. This code now sets or clears the ENABLED flag in the record, and at the end it will enable the function if the flag is set, or disable the function if the flag is cleared. The parameters for the function that does the above logic is also simplified. Instead of passing in confusing "new" and "old" where they might be swapped if the "enabled" flag is not set. The old logic even had one of the above always NULL and had to be filled in. The new logic simply passes in one parameter called "nop". A "call" is calculated in the code, and at the end of the logic, when we know we need to either disable or enable the function, we can then use the "nop" and "call" properly. This code is more robust than the previous version. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 108 +++++++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5cbddb59e99..fdaab04a028 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -327,96 +327,89 @@ ftrace_record_ip(unsigned long ip) static int __ftrace_replace_code(struct dyn_ftrace *rec, - unsigned char *old, unsigned char *new, int enable) + unsigned char *nop, int enable) { unsigned long ip, fl; + unsigned char *call, *old, *new; ip = rec->ip; - if (ftrace_filtered && enable) { + /* + * If this record is not to be traced and + * it is not enabled then do nothing. + * + * If this record is not to be traced and + * it is enabled then disabled it. 
+ * + */ + if (rec->flags & FTRACE_FL_NOTRACE) { + if (rec->flags & FTRACE_FL_ENABLED) + rec->flags &= ~FTRACE_FL_ENABLED; + else + return 0; + + } else if (ftrace_filtered && enable) { /* - * If filtering is on: - * - * If this record is set to be filtered and - * is enabled then do nothing. - * - * If this record is set to be filtered and - * it is not enabled, enable it. - * - * If this record is not set to be filtered - * and it is not enabled do nothing. - * - * If this record is set not to trace then - * do nothing. - * - * If this record is set not to trace and - * it is enabled then disable it. - * - * If this record is not set to be filtered and - * it is enabled, disable it. + * Filtering is on: */ - fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE | - FTRACE_FL_ENABLED); + fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_ENABLED); - if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) || - (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) || - !fl || (fl == FTRACE_FL_NOTRACE)) + /* Record is filtered and enabled, do nothing */ + if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) return 0; - /* - * If it is enabled disable it, - * otherwise enable it! - */ - if (fl & FTRACE_FL_ENABLED) { - /* swap new and old */ - new = old; - old = ftrace_call_replace(ip, FTRACE_ADDR); + /* Record is not filtered and is not enabled do nothing */ + if (!fl) + return 0; + + /* Record is not filtered but enabled, disable it */ + if (fl == FTRACE_FL_ENABLED) rec->flags &= ~FTRACE_FL_ENABLED; - } else { - new = ftrace_call_replace(ip, FTRACE_ADDR); + else + /* Otherwise record is filtered but not enabled, enable it */ rec->flags |= FTRACE_FL_ENABLED; - } } else { + /* Disable or not filtered */ if (enable) { - /* - * If this record is set not to trace and is - * not enabled, do nothing. 
- */ - fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED); - if (fl == FTRACE_FL_NOTRACE) - return 0; - - new = ftrace_call_replace(ip, FTRACE_ADDR); - } else - old = ftrace_call_replace(ip, FTRACE_ADDR); - - if (enable) { + /* if record is enabled, do nothing */ if (rec->flags & FTRACE_FL_ENABLED) return 0; + rec->flags |= FTRACE_FL_ENABLED; + } else { + + /* if record is not enabled do nothing */ if (!(rec->flags & FTRACE_FL_ENABLED)) return 0; + rec->flags &= ~FTRACE_FL_ENABLED; } } + call = ftrace_call_replace(ip, FTRACE_ADDR); + + if (rec->flags & FTRACE_FL_ENABLED) { + old = nop; + new = call; + } else { + old = call; + new = nop; + } + return ftrace_modify_code(ip, old, new); } static void ftrace_replace_code(int enable) { int i, failed; - unsigned char *new = NULL, *old = NULL; + unsigned char *nop = NULL; struct dyn_ftrace *rec; struct ftrace_page *pg; - if (enable) - old = ftrace_nop_replace(); - else - new = ftrace_nop_replace(); + nop = ftrace_nop_replace(); for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { @@ -434,7 +427,7 @@ static void ftrace_replace_code(int enable) unfreeze_record(rec); } - failed = __ftrace_replace_code(rec, old, new, enable); + failed = __ftrace_replace_code(rec, nop, enable); if (failed && (rec->flags & FTRACE_FL_CONVERTED)) { rec->flags |= FTRACE_FL_FAILED; if ((system_state == SYSTEM_BOOTING) || @@ -538,8 +531,7 @@ static void ftrace_startup(void) mutex_lock(&ftrace_start_lock); ftrace_start++; - if (ftrace_start == 1) - command |= FTRACE_ENABLE_CALLS; + command |= FTRACE_ENABLE_CALLS; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; -- cgit v1.2.3-70-g09d2 From de11defebf00007677fb7ee91d9b089b78786fbb Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 19 Nov 2008 15:36:14 -0800 Subject: reintroduce accept4 Introduce a new accept4() system call. The addition of this system call matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(), inotify_init1(), epoll_create1(), pipe2()) which added new system calls that differed from analogous traditional system calls in adding a flags argument that can be used to access additional functionality. The accept4() system call is exactly the same as accept(), except that it adds a flags bit-mask argument. Two flags are initially implemented. (Most of the new system calls in 2.6.27 also had both of these flags.) SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled for the new file descriptor returned by accept4(). This is a useful security feature to avoid leaking information in a multithreaded program where one thread is doing an accept() at the same time as another thread is doing a fork() plus exec(). More details here: http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling", Ulrich Drepper). The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag to be enabled on the new open file description created by accept4(). (This flag is merely a convenience, saving the use of additional calls fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result. Here's a test program. Works on x86-32. Should work on x86-64, but I (mtk) don't have a system to hand to test with. It tests accept4() with each of the four possible combinations of SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies that the appropriate flags are set on the file descriptor/open file description returned by accept4(). 
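For a sense of how the new call is meant to be used once a libc wrapper exists (glibc 2.10 and later provide one; the 2008-era test program below has to work around the missing wrapper with raw syscalls), a minimal illustration, with the listening socket assumed to be set up elsewhere, is:

    #define _GNU_SOURCE
    #include <sys/socket.h>
    #include <stdio.h>

    /* 'listen_fd' is assumed to be a bound, listening socket. */
    int accept_connection(int listen_fd)
    {
            struct sockaddr_storage addr;
            socklen_t addrlen = sizeof(addr);
            int fd;

            /* The new descriptor is created close-on-exec and non-blocking
             * atomically: no separate fcntl() calls, and no window in which
             * a concurrent fork()+exec() could leak it. */
            fd = accept4(listen_fd, (struct sockaddr *) &addr, &addrlen,
                         SOCK_CLOEXEC | SOCK_NONBLOCK);
            if (fd == -1)
                    perror("accept4");
            return fd;
    }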
I tested Ulrich's patch in this thread by applying against 2.6.28-rc2, and it passes according to my test program. /* test_accept4.c Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk Licensed under the GNU GPLv2 or later. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define PORT_NUM 33333 #define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0) /**********************************************************************/ /* The following is what we need until glibc gets a wrapper for accept4() */ /* Flags for socket(), socketpair(), accept4() */ #ifndef SOCK_CLOEXEC #define SOCK_CLOEXEC O_CLOEXEC #endif #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #ifdef __x86_64__ #define SYS_accept4 288 #elif __i386__ #define USE_SOCKETCALL 1 #define SYS_ACCEPT4 18 #else #error "Sorry -- don't know the syscall # on this architecture" #endif static int accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags) { printf("Calling accept4(): flags = %x", flags); if (flags != 0) { printf(" ("); if (flags & SOCK_CLOEXEC) printf("SOCK_CLOEXEC"); if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK)) printf(" "); if (flags & SOCK_NONBLOCK) printf("SOCK_NONBLOCK"); printf(")"); } printf("\n"); #if USE_SOCKETCALL long args[6]; args[0] = fd; args[1] = (long) sockaddr; args[2] = (long) addrlen; args[3] = flags; return syscall(SYS_socketcall, SYS_ACCEPT4, args); #else return syscall(SYS_accept4, fd, sockaddr, addrlen, flags); #endif } /**********************************************************************/ static int do_test(int lfd, struct sockaddr_in *conn_addr, int closeonexec_flag, int nonblock_flag) { int connfd, acceptfd; int fdf, flf, fdf_pass, flf_pass; struct sockaddr_in claddr; socklen_t addrlen; printf("=======================================\n"); connfd = socket(AF_INET, SOCK_STREAM, 0); if (connfd == -1) die("socket"); if (connect(connfd, (struct sockaddr *) conn_addr, sizeof(struct sockaddr_in)) == -1) die("connect"); addrlen = sizeof(struct sockaddr_in); acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen, closeonexec_flag | nonblock_flag); if (acceptfd == -1) { perror("accept4()"); close(connfd); return 0; } fdf = fcntl(acceptfd, F_GETFD); if (fdf == -1) die("fcntl:F_GETFD"); fdf_pass = ((fdf & FD_CLOEXEC) != 0) == ((closeonexec_flag & SOCK_CLOEXEC) != 0); printf("Close-on-exec flag is %sset (%s); ", (fdf & FD_CLOEXEC) ? "" : "not ", fdf_pass ? "OK" : "failed"); flf = fcntl(acceptfd, F_GETFL); if (flf == -1) die("fcntl:F_GETFD"); flf_pass = ((flf & O_NONBLOCK) != 0) == ((nonblock_flag & SOCK_NONBLOCK) !=0); printf("nonblock flag is %sset (%s)\n", (flf & O_NONBLOCK) ? "" : "not ", flf_pass ? "OK" : "failed"); close(acceptfd); close(connfd); printf("Test result: %s\n", (fdf_pass && flf_pass) ? 
"PASS" : "FAIL"); return fdf_pass && flf_pass; } static int create_listening_socket(int port_num) { struct sockaddr_in svaddr; int lfd; int optval; memset(&svaddr, 0, sizeof(struct sockaddr_in)); svaddr.sin_family = AF_INET; svaddr.sin_addr.s_addr = htonl(INADDR_ANY); svaddr.sin_port = htons(port_num); lfd = socket(AF_INET, SOCK_STREAM, 0); if (lfd == -1) die("socket"); optval = 1; if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)) == -1) die("setsockopt"); if (bind(lfd, (struct sockaddr *) &svaddr, sizeof(struct sockaddr_in)) == -1) die("bind"); if (listen(lfd, 5) == -1) die("listen"); return lfd; } int main(int argc, char *argv[]) { struct sockaddr_in conn_addr; int lfd; int port_num; int passed; passed = 1; port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM; memset(&conn_addr, 0, sizeof(struct sockaddr_in)); conn_addr.sin_family = AF_INET; conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK); conn_addr.sin_port = htons(port_num); lfd = create_listening_socket(port_num); if (!do_test(lfd, &conn_addr, 0, 0)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0)) passed = 0; if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK)) passed = 0; if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK)) passed = 0; close(lfd); exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); } [mtk.manpages@gmail.com: rewrote changelog, updated test program] Signed-off-by: Ulrich Drepper Tested-by: Michael Kerrisk Acked-by: Michael Kerrisk Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/unistd_64.h | 4 +- include/linux/net.h | 6 +-- include/linux/syscalls.h | 3 +- kernel/sys_ni.c | 2 +- net/compat.c | 50 +++---------------------- net/socket.c | 80 +++++----------------------------------- 6 files changed, 21 insertions(+), 124 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 834b2c1d89f..d2e415e6666 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -639,8 +639,8 @@ __SYSCALL(__NR_fallocate, sys_fallocate) __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime) #define __NR_timerfd_gettime 287 __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime) -#define __NR_paccept 288 -__SYSCALL(__NR_paccept, sys_paccept) +#define __NR_accept4 288 +__SYSCALL(__NR_accept4, sys_accept4) #define __NR_signalfd4 289 __SYSCALL(__NR_signalfd4, sys_signalfd4) #define __NR_eventfd2 290 diff --git a/include/linux/net.h b/include/linux/net.h index 6dc14a24004..4515efae4c3 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -40,7 +40,7 @@ #define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */ #define SYS_SENDMSG 16 /* sys_sendmsg(2) */ #define SYS_RECVMSG 17 /* sys_recvmsg(2) */ -#define SYS_PACCEPT 18 /* sys_paccept(2) */ +#define SYS_ACCEPT4 18 /* sys_accept4(2) */ typedef enum { SS_FREE = 0, /* not allocated */ @@ -100,7 +100,7 @@ enum sock_type { * remaining bits are used as flags. 
*/ #define SOCK_TYPE_MASK 0xf -/* Flags for socket, socketpair, paccept */ +/* Flags for socket, socketpair, accept4 */ #define SOCK_CLOEXEC O_CLOEXEC #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK @@ -223,8 +223,6 @@ extern int sock_map_fd(struct socket *sock, int flags); extern struct socket *sockfd_lookup(int fd, int *err); #define sockfd_put(sock) fput(sock->file) extern int net_ratelimit(void); -extern long do_accept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags); #define net_random() random32() #define net_srandom(seed) srandom32((__force u32)seed) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d6ff145919c..04fb47bfb92 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -410,8 +410,7 @@ asmlinkage long sys_getsockopt(int fd, int level, int optname, asmlinkage long sys_bind(int, struct sockaddr __user *, int); asmlinkage long sys_connect(int, struct sockaddr __user *, int); asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *); -asmlinkage long sys_paccept(int, struct sockaddr __user *, int __user *, - const __user sigset_t *, size_t, int); +asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int); asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *); asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *); asmlinkage long sys_send(int, void __user *, size_t, unsigned); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a77b27b11b0..e14a2328170 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -31,7 +31,7 @@ cond_syscall(sys_socketpair); cond_syscall(sys_bind); cond_syscall(sys_listen); cond_syscall(sys_accept); -cond_syscall(sys_paccept); +cond_syscall(sys_accept4); cond_syscall(sys_connect); cond_syscall(sys_getsockname); cond_syscall(sys_getpeername); diff --git a/net/compat.c b/net/compat.c index 6ce1a1cadcc..a3a2ba0fac0 100644 --- a/net/compat.c +++ b/net/compat.c @@ -725,7 +725,7 @@ EXPORT_SYMBOL(compat_mc_getsockopt); static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), - AL(6)}; + AL(4)}; #undef AL asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags) @@ -738,52 +738,13 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, uns return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); } -asmlinkage long compat_sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize, int flags) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - int ret; - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); - - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. 
- */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} - asmlinkage long compat_sys_socketcall(int call, u32 __user *args) { int ret; u32 a[6]; u32 a0, a1; - if (call < SYS_SOCKET || call > SYS_PACCEPT) + if (call < SYS_SOCKET || call > SYS_ACCEPT4) return -EINVAL; if (copy_from_user(a, args, nas[call])) return -EFAULT; @@ -804,7 +765,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) ret = sys_listen(a0, a1); break; case SYS_ACCEPT: - ret = do_accept(a0, compat_ptr(a1), compat_ptr(a[2]), 0); + ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETSOCKNAME: ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); @@ -844,9 +805,8 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args) case SYS_RECVMSG: ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); break; - case SYS_PACCEPT: - ret = compat_sys_paccept(a0, compat_ptr(a1), compat_ptr(a[2]), - compat_ptr(a[3]), a[4], a[5]); + case SYS_ACCEPT4: + ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]); break; default: ret = -EINVAL; diff --git a/net/socket.c b/net/socket.c index 57550c3bcab..92764d83689 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1426,8 +1426,8 @@ asmlinkage long sys_listen(int fd, int backlog) * clean when we restucture accept also. */ -long do_accept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, int flags) +asmlinkage long sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr, + int __user *upeer_addrlen, int flags) { struct socket *sock, *newsock; struct file *newfile; @@ -1510,66 +1510,10 @@ out_fd: goto out_put; } -#if 0 -#ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, - const sigset_t __user *sigmask, - size_t sigsetsize, int flags) -{ - sigset_t ksigmask, sigsaved; - int ret; - - if (sigmask) { - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask))) - return -EFAULT; - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); - - if (ret < 0 && signal_pending(current)) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(¤t->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} -#else -asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr, - int __user *upeer_addrlen, - const sigset_t __user *sigmask, - size_t sigsetsize, int flags) -{ - /* The platform does not support restoring the signal mask in the - * return path. So we do not allow using paccept() with a signal - * mask. 
*/ - if (sigmask) - return -EINVAL; - - return do_accept(fd, upeer_sockaddr, upeer_addrlen, flags); -} -#endif -#endif - asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen) { - return do_accept(fd, upeer_sockaddr, upeer_addrlen, 0); + return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0); } /* @@ -2096,7 +2040,7 @@ static const unsigned char nargs[19]={ AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), AL(6),AL(2),AL(5),AL(5),AL(3),AL(3), - AL(6) + AL(4) }; #undef AL @@ -2115,7 +2059,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args) unsigned long a0, a1; int err; - if (call < 1 || call > SYS_PACCEPT) + if (call < 1 || call > SYS_ACCEPT4) return -EINVAL; /* copy_from_user should be SMP safe. */ @@ -2143,9 +2087,8 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args) err = sys_listen(a0, a1); break; case SYS_ACCEPT: - err = - do_accept(a0, (struct sockaddr __user *)a1, - (int __user *)a[2], 0); + err = sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 0); break; case SYS_GETSOCKNAME: err = @@ -2192,12 +2135,9 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args) case SYS_RECVMSG: err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]); break; - case SYS_PACCEPT: - err = - sys_paccept(a0, (struct sockaddr __user *)a1, - (int __user *)a[2], - (const sigset_t __user *) a[3], - a[4], a[5]); + case SYS_ACCEPT4: + err = sys_accept4(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], a[3]); break; default: err = -EINVAL; -- cgit v1.2.3-70-g09d2 From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 19 Nov 2008 15:36:30 -0800 Subject: cpuset: update top cpuset's mems after adding a node After adding a node into the machine, top cpuset's mems isn't updated. By reviewing the code, we found that the update function cpuset_track_online_nodes() was invoked after node_states[N_ONLINE] changes. It is wrong because N_ONLINE just means node has pgdat, and if node has/added memory, we use N_HIGH_MEMORY. So, We should invoke the update function after node_states[N_HIGH_MEMORY] changes, just like its commit says. This patch fixes it. And we use notifier of memory hotplug instead of direct calling of cpuset_track_online_nodes(). 
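The same notifier pattern, pulled out of the cpuset specifics, looks roughly like the sketch below. It is illustrative only (the client and its update action are hypothetical), but the callback signature, the MEM_ONLINE/MEM_OFFLINE actions and the priority match what the patch registers in the diff that follows.

    #include <linux/init.h>
    #include <linux/memory.h>
    #include <linux/notifier.h>

    static int demo_memory_callback(struct notifier_block *self,
                                    unsigned long action, void *arg)
    {
            switch (action) {
            case MEM_ONLINE:        /* a node gained usable memory */
            case MEM_OFFLINE:       /* a node lost its usable memory */
                    /* re-read node_states[N_HIGH_MEMORY] here */
                    break;
            default:
                    break;
            }
            return NOTIFY_OK;
    }

    static int __init demo_init(void)
    {
            /* priority 10, matching the registration the patch adds */
            hotplug_memory_notifier(demo_memory_callback, 10);
            return 0;
    }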
Signed-off-by: Miao Xie Acked-by: Yasunori Goto Cc: David Rientjes Cc: Paul Menage Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 4 ---- kernel/cpuset.c | 19 ++++++++++++++++--- mm/memory_hotplug.c | 3 --- 3 files changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 2691926fb50..8e540d32c9f 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -74,8 +74,6 @@ static inline int cpuset_do_slab_mem_spread(void) return current->flags & PF_SPREAD_SLAB; } -extern void cpuset_track_online_nodes(void); - extern int current_cpuset_is_being_rebound(void); extern void rebuild_sched_domains(void); @@ -151,8 +149,6 @@ static inline int cpuset_do_slab_mem_spread(void) return 0; } -static inline void cpuset_track_online_nodes(void) {} - static inline int current_cpuset_is_being_rebound(void) { return 0; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 81fc6791a29..da7ff6137f3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -2015,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. * See also the previous routine cpuset_track_online_cpus(). */ -void cpuset_track_online_nodes(void) +static int cpuset_track_online_nodes(struct notifier_block *self, + unsigned long action, void *arg) { cgroup_lock(); - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); + switch (action) { + case MEM_ONLINE: + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + break; + case MEM_OFFLINE: + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + break; + default: + break; + } cgroup_unlock(); + return NOTIFY_OK; } #endif @@ -2036,6 +2048,7 @@ void __init cpuset_init_smp(void) top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; hotcpu_notifier(cpuset_track_online_cpus, 0); + hotplug_memory_notifier(cpuset_track_online_nodes, 10); } /** diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 6837a101437..b5b2b15085a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -498,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size) /* we online node here. we can't roll back from here. */ node_set_online(nid); - cpuset_track_online_nodes(); - if (new_pgdat) { ret = register_one_node(nid); /* -- cgit v1.2.3-70-g09d2 From 3fa59dfbc3b223f02c26593be69ce6fc9a940405 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 19 Nov 2008 15:36:34 -0800 Subject: cgroup: fix potential deadlock in pre_destroy As Balbir pointed out, memcg's pre_destroy handler has potential deadlock. It has following lock sequence. cgroup_mutex (cgroup_rmdir) -> pre_destroy -> mem_cgroup_pre_destroy-> force_empty -> cpu_hotplug.lock. (lru_add_drain_all-> schedule_work-> get_online_cpus) But, cpuset has following. cpu_hotplug.lock (call notifier) -> cgroup_mutex. (within notifier) Then, this lock sequence should be fixed. Considering how pre_destroy works, it's not necessary to holding cgroup_mutex() while calling it. As a side effect, we don't have to wait at this mutex while memcg's force_empty works.(it can be long when there are tons of pages.) 
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Li Zefan Cc: Paul Menage Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 358e77564e6..1a06be61dcd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2472,10 +2472,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&cgroup_mutex); return -EBUSY; } - - parent = cgrp->parent; - root = cgrp->root; - sb = root->sb; + mutex_unlock(&cgroup_mutex); /* * Call pre_destroy handlers of subsys. Notify subsystems @@ -2483,7 +2480,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) */ cgroup_call_pre_destroy(cgrp); - if (cgroup_has_css_refs(cgrp)) { + mutex_lock(&cgroup_mutex); + parent = cgrp->parent; + root = cgrp->root; + sb = root->sb; + + if (atomic_read(&cgrp->count) + || !list_empty(&cgrp->children) + || cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } -- cgit v1.2.3-70-g09d2 From 966c8c12dc9e77f931e2281ba25d2f0244b06949 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 19 Nov 2008 15:36:36 -0800 Subject: sprint_symbol(): use less stack sprint_symbol(), itself used when dumping stacks, has been wasting 128 bytes of stack: lookup the symbol directly into the buffer supplied by the caller, instead of using a locally declared namebuf. I believe the name != buffer strcpy() is obsolete: the design here dates from when module symbol lookup pointed into a supposedly const but sadly volatile table; nowadays it copies, but an uncalled strcpy() looks better here than the risk of a recursive BUG_ON(). Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kallsyms.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5072cf1685a..7b8b0f21a5b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -304,17 +304,24 @@ int sprint_symbol(char *buffer, unsigned long address) char *modname; const char *name; unsigned long offset, size; - char namebuf[KSYM_NAME_LEN]; + int len; - name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); + name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) return sprintf(buffer, "0x%lx", address); + if (name != buffer) + strcpy(buffer, name); + len = strlen(buffer); + buffer += len; + if (modname) - return sprintf(buffer, "%s+%#lx/%#lx [%s]", name, offset, - size, modname); + len += sprintf(buffer, "+%#lx/%#lx [%s]", + offset, size, modname); else - return sprintf(buffer, "%s+%#lx/%#lx", name, offset, size); + len += sprintf(buffer, "+%#lx/%#lx", offset, size); + + return len; } /* Look up a kernel symbol and print it to the kernel messages. */ -- cgit v1.2.3-70-g09d2 From 33d283bef23132c48195eafc21449f8ba88fce6b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 19 Nov 2008 15:36:48 -0800 Subject: cgroups: fix a serious bug in cgroupstats Try this, and you'll get oops immediately: # cd Documentation/accounting/ # gcc -o getdelays getdelays.c # mount -t cgroup -o debug xxx /mnt # ./getdelays -C /mnt/tasks Because a normal file's dentry->d_fsdata is a pointer to struct cftype, not struct cgroup. After the patch, it returns EINVAL if we try to get cgroupstats from a normal file. 
Cc: Balbir Singh Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a06be61dcd..fe00b3b983a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2039,10 +2039,13 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) struct cgroup *cgrp; struct cgroup_iter it; struct task_struct *tsk; + /* - * Validate dentry by checking the superblock operations + * Validate dentry by checking the superblock operations, + * and make sure it's a directory. */ - if (dentry->d_sb->s_op != &cgroup_ops) + if (dentry->d_sb->s_op != &cgroup_ops || + !S_ISDIR(dentry->d_inode->i_mode)) goto err; ret = 0; -- cgit v1.2.3-70-g09d2 From 522a110b42b306d696cf84e34c677ed0e7080194 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 21 Nov 2008 11:00:18 +0800 Subject: function tracing: fix wrong position computing of stack_trace Impact: make output of stack_trace complete if buffer overruns When read buffer overruns, the output of stack_trace isn't complete. When printing records with seq_printf in t_show, if the read buffer has overruned by the current record, then this record won't be printed to user space through read buffer, it will just be dropped in this printing. When next printing, t_start should return the "*pos"th record, which is the one dropped by previous printing, but it just returns (m->private + *pos)th record. Here we use a more sane method to implement seq_operations which can be found in kernel code. Thus we needn't initialize m->private. About testing, it's not easy to overrun read buffer, but we can use seq_printf to print more padding bytes in t_show, then it's easy to check whether or not records are lost. This commit has been tested on both condition of overrun and non overrun. 
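SEQ_START_TOKEN is the stock seq_file idiom for emitting a header row before the real records. A stripped-down sketch over a hypothetical fixed array (not the stack tracer itself) shows the shape the fix moves to:

    #include <linux/seq_file.h>

    static unsigned long demo_entries[16];
    static unsigned int demo_nr_entries;

    static void *demo_start(struct seq_file *m, loff_t *pos)
    {
            if (*pos == 0)
                    return SEQ_START_TOKEN;         /* header goes first */
            return *pos <= demo_nr_entries ? &demo_entries[*pos - 1] : NULL;
    }

    static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
    {
            (*pos)++;
            return *pos <= demo_nr_entries ? &demo_entries[*pos - 1] : NULL;
    }

    static void demo_stop(struct seq_file *m, void *v)
    {
    }

    static int demo_show(struct seq_file *m, void *v)
    {
            if (v == SEQ_START_TOKEN)
                    seq_puts(m, "  entry\n  -----\n");
            else
                    seq_printf(m, "  %lu\n", *(unsigned long *)v);
            return 0;
    }

    static const struct seq_operations demo_seq_ops = {
            .start = demo_start,
            .next  = demo_next,
            .stop  = demo_stop,
            .show  = demo_show,
    };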
Signed-off-by: Liming Wang Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_stack.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index be682b62fe5..3bdb44bde4b 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -184,11 +184,16 @@ static struct file_operations stack_max_size_fops = { static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - long i = (long)m->private; + long i; (*pos)++; - i++; + if (v == SEQ_START_TOKEN) + i = 0; + else { + i = *(long *)v; + i++; + } if (i >= max_stack_trace.nr_entries || stack_dump_trace[i] == ULONG_MAX) @@ -201,12 +206,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - void *t = &m->private; + void *t = SEQ_START_TOKEN; loff_t l = 0; local_irq_disable(); __raw_spin_lock(&max_stack_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + for (; t && l < *pos; t = t_next(m, t, &l)) ; @@ -235,10 +243,10 @@ static int trace_lookup_stack(struct seq_file *m, long i) static int t_show(struct seq_file *m, void *v) { - long i = *(long *)v; + long i; int size; - if (i < 0) { + if (v == SEQ_START_TOKEN) { seq_printf(m, " Depth Size Location" " (%d entries)\n" " ----- ---- --------\n", @@ -246,6 +254,8 @@ static int t_show(struct seq_file *m, void *v) return 0; } + i = *(long *)v; + if (i >= max_stack_trace.nr_entries || stack_dump_trace[i] == ULONG_MAX) return 0; @@ -275,10 +285,6 @@ static int stack_trace_open(struct inode *inode, struct file *file) int ret; ret = seq_open(file, &stack_trace_seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = (void *)-1; - } return ret; } -- cgit v1.2.3-70-g09d2 From b0788caf7af773b6c2374590dabd3a205f0918a8 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 21 Nov 2008 15:57:32 +0800 Subject: lockdep: consistent alignement for lockdep info Impact: prettify /proc/lockdep_info Just feel odd that not all lines of lockdep info are aligned. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 06e157119d2..46a404173db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3276,10 +3276,10 @@ void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); - printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); + printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES); printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH); printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS); - printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); + printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE); printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES); printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS); printk("... 
CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE); -- cgit v1.2.3-70-g09d2 From 7ee1768ddb3075ae3a0801cc2d0ea4195530a7db Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sun, 23 Nov 2008 21:24:30 +0200 Subject: x86, mmiotrace: fix buffer overrun detection Impact: fix mmiotrace overrun tracing When ftrace framework moved to use the ring buffer facility, the buffer overrun detection was broken after 2.6.27 by commit | commit 3928a8a2d98081d1bc3c0a84a2d70e29b90ecf1c | Author: Steven Rostedt | Date: Mon Sep 29 23:02:41 2008 -0400 | | ftrace: make work with new ring buffer | | This patch ports ftrace over to the new ring buffer. The detection is now fixed by using the ring buffer API. When mmiotrace detects a buffer overrun, it will report the number of lost events. People reading an mmiotrace log must know if something was missed, otherwise the data may not make sense. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace_mmiotrace.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index f28484618ff..e62cbf78eab 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -18,12 +18,14 @@ struct header_iter { static struct trace_array *mmio_trace_array; static bool overrun_detected; +static unsigned long prev_overruns; static void mmio_reset_data(struct trace_array *tr) { int cpu; overrun_detected = false; + prev_overruns = 0; tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) @@ -128,16 +130,12 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { - int cpu; unsigned long cnt = 0; -/* FIXME: */ -#if 0 - for_each_online_cpu(cpu) { - cnt += iter->overrun[cpu]; - iter->overrun[cpu] = 0; - } -#endif - (void)cpu; + unsigned long over = ring_buffer_overruns(iter->tr->buffer); + + if (over > prev_overruns) + cnt = over - prev_overruns; + prev_overruns = over; return cnt; } -- cgit v1.2.3-70-g09d2 From 4f5a7f40ddbae98569acbb99118a98570315579c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Thu, 27 Nov 2008 10:21:46 +0800 Subject: ftrace: prevent recursion Impact: prevent unnecessary stack recursion if the resched flag was set before we entered, then don't reschedule. Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f780e9552f9..668bbb5ef2b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1215,7 +1215,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, out: if (resched) - preempt_enable_notrace(); + preempt_enable_no_resched_notrace(); else preempt_enable_notrace(); return NULL; -- cgit v1.2.3-70-g09d2 From 4cd4262034849da01eb88659af677b69f8169f06 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 26 Nov 2008 21:04:24 -0500 Subject: sched: prevent divide by zero error in cpu_avg_load_per_task Impact: fix divide by zero crash in scheduler rebalance irq While testing the branch profiler, I hit this crash: divide error: 0000 [#1] PREEMPT SMP [...] RIP: 0010:[] [] cpu_avg_load_per_task+0x50/0x7f [...] Call Trace: <0> [] find_busiest_group+0x3e5/0xcaa [] rebalance_domains+0x2da/0xa21 [] ? 
find_next_bit+0x1b2/0x1e6 [] run_rebalance_domains+0x112/0x19f [] __do_softirq+0xa8/0x232 [] call_softirq+0x1c/0x3e [] do_softirq+0x94/0x1cd [] irq_exit+0x6b/0x10e [] smp_apic_timer_interrupt+0xd3/0xff [] apic_timer_interrupt+0x13/0x20 The code for cpu_avg_load_per_task has: if (rq->nr_running) rq->avg_load_per_task = rq->load.weight / rq->nr_running; The runqueue lock is not held here, and there is nothing that prevents the rq->nr_running from going to zero after it passes the if condition. The branch profiler simply made the race window bigger. This patch saves off the rq->nr_running to a local variable and uses that for both the condition and the division. Signed-off-by: Steven Rostedt Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9b1e79371c2..700aa9a1413 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,9 +1453,10 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = rq->nr_running; - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; + if (nr_running) + rq->avg_load_per_task = rq->load.weight / nr_running; else rq->avg_load_per_task = 0; -- cgit v1.2.3-70-g09d2 From 1583715ddb61f822041807a0f18b3b4845e88c76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 25 Nov 2008 10:27:49 +0100 Subject: sched, cpusets: fix warning in kernel/cpuset.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit this warning: kernel/cpuset.c: In function ‘generate_sched_domains’: kernel/cpuset.c:588: warning: ‘ndoms’ may be used uninitialized in this function triggers because GCC does not recognize that ndoms stays uninitialized only if doms is NULL - but that flow is covered at the end of generate_sched_domains(). Help out GCC by initializing this variable to 0. (that's prudent anyway) Also, this function needs a splitup and code flow simplification: with 160 lines length it's clearly too long. Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index da7ff6137f3..96c0ba13b8c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -585,7 +585,7 @@ static int generate_sched_domains(cpumask_t **domains, int i, j, k; /* indices for partition finding loops */ cpumask_t *doms; /* resulting partition; i.e. sched domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */ - int ndoms; /* number of sched domains in result */ + int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ doms = NULL; -- cgit v1.2.3-70-g09d2 From af6d596fd603219b054c1c90fb16672a9fd441bd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 29 Nov 2008 20:45:15 +0100 Subject: sched: prevent divide by zero error in cpu_avg_load_per_task, update Regarding the bug addressed in: 4cd4262: sched: prevent divide by zero error in cpu_avg_load_per_task Linus points out that the fix is not complete: > There's nothing that keeps gcc from deciding not to reload > rq->nr_running. 
> > Of course, in _practice_, I don't think gcc ever will (if it decides > that it will spill, gcc is likely going to decide that it will > literally spill the local variable to the stack rather than decide to > reload off the pointer), but it's a valid compiler optimization, and > it even has a name (rematerialization). > > So I suspect that your patch does fix the bug, but it still leaves the > fairly unlikely _potential_ for it to re-appear at some point. > > We have ACCESS_ONCE() as a macro to guarantee that the compiler > doesn't rematerialize a pointer access. That also would clarify > the fact that we access something unsafe outside a lock. So make sure our nr_running value is immutable and cannot change after we check it for nonzero. Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 700aa9a1413..b7480fb5c3d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1453,7 +1453,7 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = rq->nr_running; + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); if (nr_running) rq->avg_load_per_task = rq->load.weight / nr_running; -- cgit v1.2.3-70-g09d2 From 8419641450edc838a6ce7cdf0f99d262bf0af2d5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 22 Nov 2008 17:36:44 +0000 Subject: cpuinit fixes in kernel/* Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- kernel/profile.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 5a732c5ef08..8ea32e8d68b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -462,7 +462,7 @@ out: * It must be called by the arch code on the new cpu, before the new cpu * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -void notify_cpu_starting(unsigned int cpu) +void __cpuinit notify_cpu_starting(unsigned int cpu) { unsigned long val = CPU_STARTING; diff --git a/kernel/profile.c b/kernel/profile.c index 5b7d1ac7124..dc41827fbfe 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -351,7 +351,7 @@ out: put_cpu(); } -static int __devinit profile_cpu_callback(struct notifier_block *info, +static int __cpuinit profile_cpu_callback(struct notifier_block *info, unsigned long action, void *__cpu) { int node, cpu = (unsigned long)__cpu; @@ -596,7 +596,7 @@ out_cleanup: #define create_hash_tables() ({ 0; }) #endif -int create_proc_profile(void) +int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ { struct proc_dir_entry *entry; -- cgit v1.2.3-70-g09d2 From 96b8936a9ed08746e47081458a5eb9e43a751e24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Nov 2008 08:10:03 +0100 Subject: remove __ARCH_WANT_COMPAT_SYS_PTRACE All architectures now use the generic compat_sys_ptrace, as should every new architecture that needs 32bit compat (if we'll ever get another). Remove the now superflous __ARCH_WANT_COMPAT_SYS_PTRACE define, and also kill a comment about __ARCH_SYS_PTRACE that was added after __ARCH_SYS_PTRACE was already gone. Signed-off-by: Christoph Hellwig Acked-by: David S. 
Miller Signed-off-by: Linus Torvalds --- arch/Kconfig | 2 -- arch/ia64/include/asm/ptrace.h | 2 -- arch/mips/include/asm/ptrace.h | 4 ---- arch/parisc/include/asm/ptrace.h | 2 -- arch/powerpc/include/asm/ptrace.h | 2 -- arch/s390/include/asm/ptrace.h | 2 -- arch/sparc/include/asm/ptrace_64.h | 2 -- arch/x86/include/asm/ptrace.h | 2 -- include/linux/compat.h | 2 -- kernel/ptrace.c | 4 ++-- 10 files changed, 2 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 8977d99987c..471e72dbaf8 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -79,8 +79,6 @@ config HAVE_KRETPROBES # task_pt_regs() in asm/processor.h or asm/ptrace.h # arch_has_single_step() if there is hardware single-step support # arch_has_block_step() if there is hardware block-step support -# arch_ptrace() and not #define __ARCH_SYS_PTRACE -# compat_arch_ptrace() and #define __ARCH_WANT_COMPAT_SYS_PTRACE # asm/syscall.h supplying asm-generic/syscall.h interface # linux/regset.h user_regset interfaces # CORE_DUMP_USE_REGSET #define'd in linux/elf.h diff --git a/arch/ia64/include/asm/ptrace.h b/arch/ia64/include/asm/ptrace.h index 6417c1ecb44..14055c636ad 100644 --- a/arch/ia64/include/asm/ptrace.h +++ b/arch/ia64/include/asm/ptrace.h @@ -325,8 +325,6 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs) #define arch_has_block_step() (1) extern void user_enable_block_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #endif /* !__KERNEL__ */ /* pt_all_user_regs is used for PTRACE_GETREGS PTRACE_SETREGS */ diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 813abd16255..c2c8bac4330 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -9,10 +9,6 @@ #ifndef _ASM_PTRACE_H #define _ASM_PTRACE_H -#ifdef CONFIG_64BIT -#define __ARCH_WANT_COMPAT_SYS_PTRACE -#endif - /* 0 - 31 are integer registers, 32 - 63 are fp registers. 
*/ #define FPR_BASE 32 #define PC 64 diff --git a/arch/parisc/include/asm/ptrace.h b/arch/parisc/include/asm/ptrace.h index afa5333187b..302f68dc889 100644 --- a/arch/parisc/include/asm/ptrace.h +++ b/arch/parisc/include/asm/ptrace.h @@ -47,8 +47,6 @@ struct pt_regs { #define task_regs(task) ((struct pt_regs *) ((char *)(task) + TASK_REGS)) -#define __ARCH_WANT_COMPAT_SYS_PTRACE - struct task_struct; #define arch_has_single_step() 1 void user_disable_single_step(struct task_struct *task); diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 280a90cc989..c9c678fb253 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -55,8 +55,6 @@ struct pt_regs { #ifdef __powerpc64__ -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define STACK_FRAME_OVERHEAD 112 /* size of minimum stack frame */ #define STACK_FRAME_LR_SAVE 2 /* Location of LR in stack frame */ #define STACK_FRAME_REGS_MARKER ASM_CONST(0x7265677368657265) diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index a7226f8143f..560ce8561df 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -486,8 +486,6 @@ struct task_struct; extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) diff --git a/arch/sparc/include/asm/ptrace_64.h b/arch/sparc/include/asm/ptrace_64.h index 3d3e9c161d8..84e969f06af 100644 --- a/arch/sparc/include/asm/ptrace_64.h +++ b/arch/sparc/include/asm/ptrace_64.h @@ -142,8 +142,6 @@ struct global_reg_snapshot { }; extern struct global_reg_snapshot global_reg_snapshot[NR_CPUS]; -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define force_successful_syscall_return() \ do { current_thread_info()->syscall_noerror = 1; \ } while (0) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d1531c8480b..eefb0594b05 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -271,8 +271,6 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/compat.h b/include/linux/compat.h index f061a1ea1b7..e88f3ecf38b 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -252,12 +252,10 @@ extern int compat_ptrace_request(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); -#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t addr, compat_ulong_t data); asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); -#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ /* * epoll (fs/eventpoll.c) compat bits follow ... diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1e68e4c39e2..4c8bcd7dd8e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -612,7 +612,7 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) return (copied == sizeof(data)) ? 
0 : -EIO; } -#if defined CONFIG_COMPAT && defined __ARCH_WANT_COMPAT_SYS_PTRACE +#if defined CONFIG_COMPAT #include int compat_ptrace_request(struct task_struct *child, compat_long_t request, @@ -709,4 +709,4 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, unlock_kernel(); return ret; } -#endif /* CONFIG_COMPAT && __ARCH_WANT_COMPAT_SYS_PTRACE */ +#endif /* CONFIG_COMPAT */ -- cgit v1.2.3-70-g09d2 From 7ef9964e6d1b911b78709f144000aacadd0ebc21 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 1 Dec 2008 13:13:55 -0800 Subject: epoll: introduce resource usage limits It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Cc: Cc: Cyrill Gorcunov Reported-by: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 27 ++++++++++++ fs/eventpoll.c | 85 ++++++++++++++++++++++++++++++++++---- include/linux/sched.h | 4 ++ kernel/sysctl.c | 10 +++++ 4 files changed, 118 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index bcceb99b81d..bb1b0dd3bfc 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -44,6 +44,7 @@ Table of Contents 2.14 /proc//io - Display the IO accounting fields 2.15 /proc//coredump_filter - Core dump filtering settings 2.16 /proc//mountinfo - Information about mounts + 2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface ------------------------------------------------------------------------------ Preface @@ -2483,4 +2484,30 @@ For more information on mount propagation see: Documentation/filesystems/sharedsubtree.txt +2.17 /proc/sys/fs/epoll - Configuration options for the epoll interface +-------------------------------------------------------- + +This directory contains configuration options for the epoll(7) interface. + +max_user_instances +------------------ + +This is the maximum number of epoll file descriptors that a single user can +have open at a given time. The default value is 128, and should be enough +for normal users. + +max_user_watches +---------------- + +Every epoll file descriptor can store a number of files to be monitored +for event readiness. 
Each one of these monitored files constitutes a "watch". +This configuration option sets the maximum number of "watches" that are +allowed for each user. +Each "watch" costs roughly 90 bytes on a 32bit kernel, and roughly 160 bytes +on a 64bit one. +The current default value for max_user_watches is the 1/32 of the available +low memory, divided for the "watch" cost in bytes. + + ------------------------------------------------------------------------------ + diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aec5c13f634..96355d50534 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -102,6 +102,8 @@ #define EP_UNACTIVE_PTR ((void *) -1L) +#define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) + struct epoll_filefd { struct file *file; int fd; @@ -200,6 +202,9 @@ struct eventpoll { * holding ->lock. */ struct epitem *ovflist; + + /* The user that created the eventpoll descriptor */ + struct user_struct *user; }; /* Wait structure used by the poll hooks */ @@ -226,10 +231,18 @@ struct ep_pqueue { struct epitem *epi; }; +/* + * Configuration options available inside /proc/sys/fs/epoll/ + */ +/* Maximum number of epoll devices, per user */ +static int max_user_instances __read_mostly; +/* Maximum number of epoll watched descriptors, per user */ +static int max_user_watches __read_mostly; + /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ -static struct mutex epmutex; +static DEFINE_MUTEX(epmutex); /* Safe wake up implementation */ static struct poll_safewake psw; @@ -240,6 +253,33 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; +#ifdef CONFIG_SYSCTL + +#include + +static int zero; + +ctl_table epoll_table[] = { + { + .procname = "max_user_instances", + .data = &max_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "max_user_watches", + .data = &max_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero, + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, @@ -402,6 +442,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) /* At this point it is safe to free the eventpoll item */ kmem_cache_free(epi_cache, epi); + atomic_dec(&ep->user->epoll_watches); + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", current, ep, file)); @@ -449,6 +491,8 @@ static void ep_free(struct eventpoll *ep) mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); + atomic_dec(&ep->user->epoll_devs); + free_uid(ep->user); kfree(ep); } @@ -532,10 +576,19 @@ void eventpoll_release_file(struct file *file) static int ep_alloc(struct eventpoll **pep) { - struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL); + int error; + struct user_struct *user; + struct eventpoll *ep; - if (!ep) - return -ENOMEM; + user = get_current_user(); + error = -EMFILE; + if (unlikely(atomic_read(&user->epoll_devs) >= + max_user_instances)) + goto free_uid; + error = -ENOMEM; + ep = kzalloc(sizeof(*ep), GFP_KERNEL); + if (unlikely(!ep)) + goto free_uid; spin_lock_init(&ep->lock); mutex_init(&ep->mtx); @@ -544,12 +597,17 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT; ep->ovflist = EP_UNACTIVE_PTR; + ep->user = user; *pep = ep; DNPRINTK(3, (KERN_INFO 
"[%p] eventpoll: ep_alloc() ep=%p\n", current, ep)); return 0; + +free_uid: + free_uid(user); + return error; } /* @@ -703,9 +761,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, struct epitem *epi; struct ep_pqueue epq; - error = -ENOMEM; + if (unlikely(atomic_read(&ep->user->epoll_watches) >= + max_user_watches)) + return -ENOSPC; if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL))) - goto error_return; + return -ENOMEM; /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); @@ -735,6 +795,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ + error = -ENOMEM; if (epi->nwait < 0) goto error_unregister; @@ -765,6 +826,8 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, spin_unlock_irqrestore(&ep->lock, flags); + atomic_inc(&ep->user->epoll_watches); + /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(&psw, &ep->poll_wait); @@ -789,7 +852,7 @@ error_unregister: spin_unlock_irqrestore(&ep->lock, flags); kmem_cache_free(epi_cache, epi); -error_return: + return error; } @@ -1078,6 +1141,7 @@ asmlinkage long sys_epoll_create1(int flags) flags & O_CLOEXEC); if (fd < 0) ep_free(ep); + atomic_inc(&ep->user->epoll_devs); error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", @@ -1299,7 +1363,12 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, static int __init eventpoll_init(void) { - mutex_init(&epmutex); + struct sysinfo si; + + si_meminfo(&si); + max_user_instances = 128; + max_user_watches = (((si.totalram - si.totalhigh) / 32) << PAGE_SHIFT) / + EP_ITEM_COST; /* Initialize the structure used to perform safe poll wait head wake ups */ ep_poll_safewake_init(&psw); diff --git a/include/linux/sched.h b/include/linux/sched.h index 644ffbda17c..55e30d11447 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -630,6 +630,10 @@ struct user_struct { atomic_t inotify_watches; /* How many inotify watches does this user have? */ atomic_t inotify_devs; /* How many inotify devs does this user have opened? */ #endif +#ifdef CONFIG_EPOLL + atomic_t epoll_devs; /* The number of epoll descriptors currently open */ + atomic_t epoll_watches; /* The number of file descriptors currently watched */ +#endif #ifdef CONFIG_POSIX_MQUEUE /* protected by mq_lock */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9d048fa2d90..3d56fe7570d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -176,6 +176,9 @@ extern struct ctl_table random_table[]; #ifdef CONFIG_INOTIFY_USER extern struct ctl_table inotify_table[]; #endif +#ifdef CONFIG_EPOLL +extern struct ctl_table epoll_table[]; +#endif #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT int sysctl_legacy_va_layout; @@ -1325,6 +1328,13 @@ static struct ctl_table fs_table[] = { .child = inotify_table, }, #endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif #endif { .ctl_name = KERN_SETUID_DUMPABLE, -- cgit v1.2.3-70-g09d2 From a8005992836434cab6182c6147993d21442184c1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Dec 2008 13:14:00 -0800 Subject: taint: add missing comment The description for 'D' was missing in the comment... 
(causing me a minute of WTF followed by looking at more of the code)

Signed-off-by: Arjan van de Ven
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/panic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6513aac8e99..4d5088355bf 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -167,6 +167,7 @@ static const struct tnt tnts[] = {
  * 'M' - System experienced a machine check exception.
  * 'B' - System has hit bad_page.
  * 'U' - Userspace-defined naughtiness.
+ * 'D' - Kernel has oopsed before
  * 'A' - ACPI table overridden.
  * 'W' - Taint on warning.
  * 'C' - modules from drivers/staging are loaded.
-- cgit v1.2.3-70-g09d2
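
A short aside to make the table touched by that last patch more concrete: the taint letters listed in the comment come from a table in kernel/panic.c that maps each taint bit to the character printed in oops headers, which is why the comment and the table need to stay in sync. Below is a minimal, standalone C sketch of the idea, loosely modeled on the tnts[] array; the struct layout, the names taint_flag and taint_string, the clear-state characters and the bit numbers are assumptions for illustration only, not the kernel's exact definitions.

#include <stdio.h>

/*
 * Each entry pairs a taint bit with the character shown when the bit
 * is set, the character shown when it is clear, and the human-readable
 * meaning -- the text the patch above keeps in sync. The bit values
 * below are assumed for the sketch.
 */
struct taint_flag {
	unsigned int bit;	/* bit position in the taint mask */
	char set;		/* printed when the bit is set, e.g. 'D' */
	char unset;		/* printed when the bit is clear */
	const char *what;	/* e.g. "Kernel has oopsed before" */
};

static const struct taint_flag flags[] = {
	{ 0, 'P', 'G', "Proprietary module has been loaded" },
	{ 7, 'D', ' ', "Kernel has oopsed before" },
	{ 9, 'W', ' ', "Taint on warning" },
};

/* Build a "GD "-style summary string from a taint bitmask. */
static void taint_string(unsigned long mask, char *buf)
{
	size_t i;

	for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
		buf[i] = (mask & (1UL << flags[i].bit)) ?
			 flags[i].set : flags[i].unset;
	buf[i] = '\0';
}

int main(void)
{
	char buf[sizeof(flags) / sizeof(flags[0]) + 1];

	taint_string(1UL << 7, buf);	/* pretend the kernel has oopsed once */
	printf("Tainted: %s\n", buf);
	return 0;
}

Keeping the letter, its clear-state counterpart and the description together in one table is what makes a comment block like the one above easy to audit; the missing 'D' entry was simply the one whose description had drifted out of the comment.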