From 1a38416cea8ac801ae8f261074721f35317613dc Mon Sep 17 00:00:00 2001
From: David Shaohua Li <shaohua.li@intel.com>
Date: Wed, 23 Nov 2005 12:36:00 -0500
Subject: [ACPI] SMP S3 resume: evaluate _WAK after INIT

On SMP resume from S3, we reset (INIT) the non-boot
processors to boot them cleanly.  But the BIOS needs
to execute _WAK after INIT in order to properly
initialized these processors upon resume.

http://bugzilla.kernel.org/show_bug.cgi?id=5651

Signed-off-by: David Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index d253f3ae2fa..9cb235cba4a 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -133,10 +133,10 @@ static int suspend_enter(suspend_state_t state)
 static void suspend_finish(suspend_state_t state)
 {
 	device_resume();
-	if (pm_ops && pm_ops->finish)
-		pm_ops->finish(state);
 	thaw_processes();
 	enable_nonboot_cpus();
+	if (pm_ops && pm_ops->finish)
+		pm_ops->finish(state);
 	pm_restore_console();
 }
 
-- 
cgit v1.2.3-70-g09d2


From 729b4d4ce1982c52040bbf22d6711cdf8db07ad8 Mon Sep 17 00:00:00 2001
From: Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
Date: Thu, 1 Dec 2005 04:29:00 -0500
Subject: [ACPI] fix reboot upon suspend-to-disk

http://bugzilla.kernel.org/show_bug.cgi?id=4320

Signed-off-by: Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/acpi/sleep/poweroff.c | 15 +++++++++------
 drivers/acpi/sleep/sleep.h    |  2 +-
 drivers/acpi/sleep/wakeup.c   |  6 +++---
 include/linux/kernel.h        |  1 +
 include/linux/reboot.h        |  3 +--
 kernel/power/disk.c           |  9 +--------
 kernel/sys.c                  | 25 ++++++++++---------------
 7 files changed, 26 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/sleep/poweroff.c b/drivers/acpi/sleep/poweroff.c
index af7935a95bc..47fb4b394ee 100644
--- a/drivers/acpi/sleep/poweroff.c
+++ b/drivers/acpi/sleep/poweroff.c
@@ -33,9 +33,7 @@ int acpi_sleep_prepare(u32 acpi_state)
 	ACPI_FLUSH_CPU_CACHE();
 	acpi_enable_wakeup_device_prep(acpi_state);
 #endif
-	if (acpi_state == ACPI_STATE_S5) {
-		acpi_wakeup_gpe_poweroff_prepare();
-	}
+	acpi_gpe_sleep_prepare(acpi_state);
 	acpi_enter_sleep_state_prep(acpi_state);
 	return 0;
 }
@@ -53,11 +51,16 @@ void acpi_power_off(void)
 
 static int acpi_shutdown(struct sys_device *x)
 {
-	if (system_state == SYSTEM_POWER_OFF) {
-		/* Prepare if we are going to power off the system */
+	switch (system_state) {
+	case SYSTEM_POWER_OFF:
+		/* Prepare to power off the system */
 		return acpi_sleep_prepare(ACPI_STATE_S5);
+	case SYSTEM_SUSPEND_DISK:
+		/* Prepare to suspend the system to disk */
+		return acpi_sleep_prepare(ACPI_STATE_S4);
+	default:
+		return 0;
 	}
-	return 0;
 }
 
 static struct sysdev_class acpi_sysclass = {
diff --git a/drivers/acpi/sleep/sleep.h b/drivers/acpi/sleep/sleep.h
index efd0001c6f0..f3e70397a7d 100644
--- a/drivers/acpi/sleep/sleep.h
+++ b/drivers/acpi/sleep/sleep.h
@@ -5,4 +5,4 @@ extern int acpi_suspend (u32 state);
 extern void acpi_enable_wakeup_device_prep(u8 sleep_state);
 extern void acpi_enable_wakeup_device(u8 sleep_state);
 extern void acpi_disable_wakeup_device(u8 sleep_state);
-extern void acpi_wakeup_gpe_poweroff_prepare(void);
+extern void acpi_gpe_sleep_prepare(u32 sleep_state);
diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/sleep/wakeup.c
index 4134ed43d02..85df0ceda2a 100644
--- a/drivers/acpi/sleep/wakeup.c
+++ b/drivers/acpi/sleep/wakeup.c
@@ -192,7 +192,7 @@ late_initcall(acpi_wakeup_device_init);
  * RUNTIME GPEs, we simply mark all GPES that
  * are not enabled for wakeup from S5 as RUNTIME.
  */
-void acpi_wakeup_gpe_poweroff_prepare(void)
+void acpi_gpe_sleep_prepare(u32 sleep_state)
 {
 	struct list_head *node, *next;
 
@@ -201,8 +201,8 @@ void acpi_wakeup_gpe_poweroff_prepare(void)
 						       struct acpi_device,
 						       wakeup_list);
 
-		/* The GPE can wakeup system from S5, don't touch it */
-		if ((u32) dev->wakeup.sleep_state == ACPI_STATE_S5)
+		/* The GPE can wakeup system from this state, don't touch it */
+		if ((u32) dev->wakeup.sleep_state >= sleep_state)
 			continue;
 		/* acpi_set_gpe_type will automatically disable GPE */
 		acpi_set_gpe_type(dev->wakeup.gpe_device,
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b1e407a4fbd..73aa55a7333 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -181,6 +181,7 @@ extern enum system_states {
 	SYSTEM_HALT,
 	SYSTEM_POWER_OFF,
 	SYSTEM_RESTART,
+	SYSTEM_SUSPEND_DISK,
 } system_state;
 
 #define TAINT_PROPRIETARY_MODULE	(1<<0)
diff --git a/include/linux/reboot.h b/include/linux/reboot.h
index 7ab2cdb83ef..015297ff73f 100644
--- a/include/linux/reboot.h
+++ b/include/linux/reboot.h
@@ -60,8 +60,7 @@ extern void machine_crash_shutdown(struct pt_regs *);
  */
 
 extern void kernel_restart_prepare(char *cmd);
-extern void kernel_halt_prepare(void);
-extern void kernel_power_off_prepare(void);
+extern void kernel_shutdown_prepare(enum system_states state);
 
 extern void kernel_restart(char *cmd);
 extern void kernel_halt(void);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 027322a564f..f2cd279d07c 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -52,7 +52,7 @@ static void power_down(suspend_disk_method_t mode)
 
 	switch(mode) {
 	case PM_DISK_PLATFORM:
-		kernel_power_off_prepare();
+		kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK);
 		error = pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
@@ -119,13 +119,6 @@ static int prepare_processes(void)
 		goto thaw;
 	}
 
-	if (pm_disk_mode == PM_DISK_PLATFORM) {
-		if (pm_ops && pm_ops->prepare) {
-			if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
-				goto thaw;
-		}
-	}
-
 	/* Free memory before shutting down devices. */
 	free_some_memory();
 	return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index eecf84526af..c3b1874661f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -427,23 +427,25 @@ void kernel_kexec(void)
 }
 EXPORT_SYMBOL_GPL(kernel_kexec);
 
+void kernel_shutdown_prepare(enum system_states state)
+{
+	notifier_call_chain(&reboot_notifier_list,
+		(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
+	system_state = state;
+	device_shutdown();
+}
 /**
  *	kernel_halt - halt the system
  *
  *	Shutdown everything and perform a clean system halt.
  */
-void kernel_halt_prepare(void)
-{
-	notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-	system_state = SYSTEM_HALT;
-	device_shutdown();
-}
 void kernel_halt(void)
 {
-	kernel_halt_prepare();
+	kernel_shutdown_prepare(SYSTEM_HALT);
 	printk(KERN_EMERG "System halted.\n");
 	machine_halt();
 }
+
 EXPORT_SYMBOL_GPL(kernel_halt);
 
 /**
@@ -451,20 +453,13 @@ EXPORT_SYMBOL_GPL(kernel_halt);
  *
  *	Shutdown everything and perform a clean system power_off.
  */
-void kernel_power_off_prepare(void)
-{
-	notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-	system_state = SYSTEM_POWER_OFF;
-	device_shutdown();
-}
 void kernel_power_off(void)
 {
-	kernel_power_off_prepare();
+	kernel_shutdown_prepare(SYSTEM_POWER_OFF);
 	printk(KERN_EMERG "Power down.\n");
 	machine_power_off();
 }
 EXPORT_SYMBOL_GPL(kernel_power_off);
-
 /*
  * Reboot system call: for obvious reasons only root may call it,
  * and even root needs to set up some magic numbers in the registers
-- 
cgit v1.2.3-70-g09d2


From 951069e311a2a931bf7c9d838db860f90bf14c45 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Tue, 31 Jan 2006 10:16:55 -0800
Subject: Don't try to "validate" a non-existing timeval.

settime() with a NULL timeval is silly but legal.

Noticed by Dave Jones <davej@redhat.com>

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time.c     | 2 +-
 security/seclvl.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 7477b1d2079..1f23e683d6a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -155,7 +155,7 @@ int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
 	static int firsttime = 1;
 	int error = 0;
 
-	if (!timespec_valid(tv))
+	if (tv && !timespec_valid(tv))
 		return -EINVAL;
 
 	error = security_settime(tv, tz);
diff --git a/security/seclvl.c b/security/seclvl.c
index 1caac016464..8529ea6f7aa 100644
--- a/security/seclvl.c
+++ b/security/seclvl.c
@@ -368,8 +368,8 @@ static int seclvl_capable(struct task_struct *tsk, int cap)
  */
 static int seclvl_settime(struct timespec *tv, struct timezone *tz)
 {
-	struct timespec now;
-	if (seclvl > 1) {
+	if (tv && seclvl > 1) {
+		struct timespec now;
 		now = current_kernel_time();
 		if (tv->tv_sec < now.tv_sec ||
 		    (tv->tv_sec == now.tv_sec && tv->tv_nsec < now.tv_nsec)) {
-- 
cgit v1.2.3-70-g09d2


From 70b4d63e98fd93fb9742708a54f872cba24e0fea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 30 Jan 2006 20:24:38 +0100
Subject: [PATCH] Fix boot-time slowdown for measure_migration_cost

This reduces the amount of time the migration cost calculations cost
during bootup. Based on numbers by Tony Luck <tony.luck@intel.com>.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3ee2ae45125..ec7fd9cee30 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5141,7 +5141,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 #define SEARCH_SCOPE		2
 #define MIN_CACHE_SIZE		(64*1024U)
 #define DEFAULT_CACHE_SIZE	(5*1024*1024U)
-#define ITERATIONS		2
+#define ITERATIONS		1
 #define SIZE_THRESH		130
 #define COST_THRESH		130
 
@@ -5480,9 +5480,9 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
 				break;
 			}
 		/*
-		 * Increase the cachesize in 5% steps:
+		 * Increase the cachesize in 10% steps:
 		 */
-		size = size * 20 / 19;
+		size = size * 10 / 9;
 	}
 
 	if (migration_debug)
-- 
cgit v1.2.3-70-g09d2


From 4021cb279a532728c3208a16b9b09b0ca8016850 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Jan 2006 15:23:07 +0100
Subject: [PATCH] fix uidhash_lock <-> RCU deadlock

RCU task-struct freeing can call free_uid(), which is taking
uidhash_lock - while other users of uidhash_lock are softirq-unsafe.

The fix is to always take the uidhash_spinlock in a softirq-safe manner.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/user.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 89e562feb1b..d1ae2349347 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/bitops.h>
 #include <linux/key.h>
+#include <linux/interrupt.h>
 
 /*
  * UID task count cache, to get fast user lookup in "alloc_uid"
@@ -27,6 +28,12 @@
 
 static kmem_cache_t *uid_cachep;
 static struct list_head uidhash_table[UIDHASH_SZ];
+
+/*
+ * The uidhash_lock is mostly taken from process context, but it is
+ * occasionally also taken from softirq/tasklet context, when
+ * task-structs get RCU-freed. Hence all locking must be softirq-safe.
+ */
 static DEFINE_SPINLOCK(uidhash_lock);
 
 struct user_struct root_user = {
@@ -83,14 +90,15 @@ struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
 
-	spin_lock(&uidhash_lock);
+	spin_lock_bh(&uidhash_lock);
 	ret = uid_hash_find(uid, uidhashentry(uid));
-	spin_unlock(&uidhash_lock);
+	spin_unlock_bh(&uidhash_lock);
 	return ret;
 }
 
 void free_uid(struct user_struct *up)
 {
+	local_bh_disable();
 	if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
 		uid_hash_remove(up);
 		key_put(up->uid_keyring);
@@ -98,6 +106,7 @@ void free_uid(struct user_struct *up)
 		kmem_cache_free(uid_cachep, up);
 		spin_unlock(&uidhash_lock);
 	}
+	local_bh_enable();
 }
 
 struct user_struct * alloc_uid(uid_t uid)
@@ -105,9 +114,9 @@ struct user_struct * alloc_uid(uid_t uid)
 	struct list_head *hashent = uidhashentry(uid);
 	struct user_struct *up;
 
-	spin_lock(&uidhash_lock);
+	spin_lock_bh(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
-	spin_unlock(&uidhash_lock);
+	spin_unlock_bh(&uidhash_lock);
 
 	if (!up) {
 		struct user_struct *new;
@@ -137,7 +146,7 @@ struct user_struct * alloc_uid(uid_t uid)
 		 * Before adding this, check whether we raced
 		 * on adding the same user already..
 		 */
-		spin_lock(&uidhash_lock);
+		spin_lock_bh(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
 			key_put(new->uid_keyring);
@@ -147,7 +156,7 @@ struct user_struct * alloc_uid(uid_t uid)
 			uid_hash_insert(new, hashent);
 			up = new;
 		}
-		spin_unlock(&uidhash_lock);
+		spin_unlock_bh(&uidhash_lock);
 
 	}
 	return up;
@@ -183,9 +192,9 @@ static int __init uid_cache_init(void)
 		INIT_LIST_HEAD(uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
-	spin_lock(&uidhash_lock);
+	spin_lock_bh(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(0));
-	spin_unlock(&uidhash_lock);
+	spin_unlock_bh(&uidhash_lock);
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From adac1665234dd966990af846eccd20b7f4923279 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Jan 2006 19:50:12 +0100
Subject: [PATCH] rcu_torture_lock deadlock fix

rcu_torture_lock is used in a softirq-unsafe manner, but it is also
taken by rcu_torture_cb(), which may execute in softirq-context,
resulting in potential deadlocks.

The fix is to acquire rcu_torture_lock in a softirq-safe manner.  With
this fix applied, the rcu-torture code passes validation.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 773219907dd..7712912dbc8 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -114,16 +114,16 @@ rcu_torture_alloc(void)
 {
 	struct list_head *p;
 
-	spin_lock(&rcu_torture_lock);
+	spin_lock_bh(&rcu_torture_lock);
 	if (list_empty(&rcu_torture_freelist)) {
 		atomic_inc(&n_rcu_torture_alloc_fail);
-		spin_unlock(&rcu_torture_lock);
+		spin_unlock_bh(&rcu_torture_lock);
 		return NULL;
 	}
 	atomic_inc(&n_rcu_torture_alloc);
 	p = rcu_torture_freelist.next;
 	list_del_init(p);
-	spin_unlock(&rcu_torture_lock);
+	spin_unlock_bh(&rcu_torture_lock);
 	return container_of(p, struct rcu_torture, rtort_free);
 }
 
@@ -134,9 +134,9 @@ static void
 rcu_torture_free(struct rcu_torture *p)
 {
 	atomic_inc(&n_rcu_torture_free);
-	spin_lock(&rcu_torture_lock);
+	spin_lock_bh(&rcu_torture_lock);
 	list_add_tail(&p->rtort_free, &rcu_torture_freelist);
-	spin_unlock(&rcu_torture_lock);
+	spin_unlock_bh(&rcu_torture_lock);
 }
 
 static void
-- 
cgit v1.2.3-70-g09d2


From 3fa97c9db4f6f93f41f7a40d08872dbfd8dc907e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 31 Jan 2006 16:34:26 -0800
Subject: [PATCH] "Fix uidhash_lock <-> RXU deadlock" fix

I get storms of warnings from local_bh_enable().  Better-tested patches,
please.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/user.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index d1ae2349347..d9deae43a9a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -33,6 +33,10 @@ static struct list_head uidhash_table[UIDHASH_SZ];
  * The uidhash_lock is mostly taken from process context, but it is
  * occasionally also taken from softirq/tasklet context, when
  * task-structs get RCU-freed. Hence all locking must be softirq-safe.
+ * But free_uid() is also called with local interrupts disabled, and running
+ * local_bh_enable() with local interrupts disabled is an error - we'll run
+ * softirq callbacks, and they can unconditionally enable interrupts, and
+ * the caller of free_uid() didn't expect that..
  */
 static DEFINE_SPINLOCK(uidhash_lock);
 
@@ -89,16 +93,19 @@ static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *has
 struct user_struct *find_user(uid_t uid)
 {
 	struct user_struct *ret;
+	unsigned long flags;
 
-	spin_lock_bh(&uidhash_lock);
+	spin_lock_irqsave(&uidhash_lock, flags);
 	ret = uid_hash_find(uid, uidhashentry(uid));
-	spin_unlock_bh(&uidhash_lock);
+	spin_unlock_irqrestore(&uidhash_lock, flags);
 	return ret;
 }
 
 void free_uid(struct user_struct *up)
 {
-	local_bh_disable();
+	unsigned long flags;
+
+	local_irq_save(flags);
 	if (up && atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
 		uid_hash_remove(up);
 		key_put(up->uid_keyring);
@@ -106,7 +113,7 @@ void free_uid(struct user_struct *up)
 		kmem_cache_free(uid_cachep, up);
 		spin_unlock(&uidhash_lock);
 	}
-	local_bh_enable();
+	local_irq_restore(flags);
 }
 
 struct user_struct * alloc_uid(uid_t uid)
@@ -114,9 +121,9 @@ struct user_struct * alloc_uid(uid_t uid)
 	struct list_head *hashent = uidhashentry(uid);
 	struct user_struct *up;
 
-	spin_lock_bh(&uidhash_lock);
+	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
-	spin_unlock_bh(&uidhash_lock);
+	spin_unlock_irq(&uidhash_lock);
 
 	if (!up) {
 		struct user_struct *new;
@@ -146,7 +153,7 @@ struct user_struct * alloc_uid(uid_t uid)
 		 * Before adding this, check whether we raced
 		 * on adding the same user already..
 		 */
-		spin_lock_bh(&uidhash_lock);
+		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
 			key_put(new->uid_keyring);
@@ -156,7 +163,7 @@ struct user_struct * alloc_uid(uid_t uid)
 			uid_hash_insert(new, hashent);
 			up = new;
 		}
-		spin_unlock_bh(&uidhash_lock);
+		spin_unlock_irq(&uidhash_lock);
 
 	}
 	return up;
@@ -192,9 +199,9 @@ static int __init uid_cache_init(void)
 		INIT_LIST_HEAD(uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
-	spin_lock_bh(&uidhash_lock);
+	spin_lock_irq(&uidhash_lock);
 	uid_hash_insert(&root_user, uidhashentry(0));
-	spin_unlock_bh(&uidhash_lock);
+	spin_unlock_irq(&uidhash_lock);
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 853609b61ef88b414ffd1613741aa59894334320 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 1 Feb 2006 03:05:07 -0800
Subject: [PATCH] swsusp: use bytes as image size units

Make swsusp use bytes as the image size units, which is needed for future
compatibility.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/interface.txt | 2 +-
 Documentation/power/swsusp.txt    | 2 +-
 kernel/power/disk.c               | 6 +++---
 kernel/power/power.h              | 4 ++--
 kernel/power/swsusp.c             | 8 ++++----
 5 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/power/interface.txt b/Documentation/power/interface.txt
index bd4ffb5bd49..4117802af0f 100644
--- a/Documentation/power/interface.txt
+++ b/Documentation/power/interface.txt
@@ -44,7 +44,7 @@ it.
 /sys/power/image_size controls the size of the image created by
 the suspend-to-disk mechanism.  It can be written a string
 representing a non-negative integer that will be used as an upper
-limit of the image size, in megabytes.  The suspend-to-disk mechanism will
+limit of the image size, in bytes.  The suspend-to-disk mechanism will
 do its best to ensure the image size will not exceed that number.  However,
 if this turns out to be impossible, it will try to suspend anyway using the
 smallest image possible.  In particular, if "0" is written to this file, the
diff --git a/Documentation/power/swsusp.txt b/Documentation/power/swsusp.txt
index 08c79d4dc54..b28b7f04abb 100644
--- a/Documentation/power/swsusp.txt
+++ b/Documentation/power/swsusp.txt
@@ -27,7 +27,7 @@ echo shutdown > /sys/power/disk; echo disk > /sys/power/state
 
 echo platform > /sys/power/disk; echo disk > /sys/power/state
 
-If you want to limit the suspend image size to N megabytes, do
+If you want to limit the suspend image size to N bytes, do
 
 echo N > /sys/power/image_size
 
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e24446f8d8c..f2b3b0ea512 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -367,14 +367,14 @@ power_attr(resume);
 
 static ssize_t image_size_show(struct subsystem * subsys, char *buf)
 {
-	return sprintf(buf, "%u\n", image_size);
+	return sprintf(buf, "%lu\n", image_size);
 }
 
 static ssize_t image_size_store(struct subsystem * subsys, const char * buf, size_t n)
 {
-	unsigned int size;
+	unsigned long size;
 
-	if (sscanf(buf, "%u", &size) == 1) {
+	if (sscanf(buf, "%lu", &size) == 1) {
 		image_size = size;
 		return n;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 7e8492fd142..61beb5e0e92 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -51,8 +51,8 @@ extern const void __nosave_begin, __nosave_end;
 extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
-/* Preferred image size in MB (default 500) */
-extern unsigned int image_size;
+/* Preferred image size in bytes (default 500 MB) */
+extern unsigned long image_size;
 
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 55a18d26abe..59c91c148e8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -70,12 +70,12 @@
 #include "power.h"
 
 /*
- * Preferred image size in MB (tunable via /sys/power/image_size).
+ * Preferred image size in bytes (tunable via /sys/power/image_size).
  * When it is set to N, swsusp will do its best to ensure the image
- * size will not exceed N MB, but if that is impossible, it will
+ * size will not exceed N bytes, but if that is impossible, it will
  * try to create the smallest image possible.
  */
-unsigned int image_size = 500;
+unsigned long image_size = 500 * 1024 * 1024;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
@@ -590,7 +590,7 @@ int swsusp_shrink_memory(void)
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
-		} else if (size > (image_size * 1024 * 1024) / PAGE_SIZE) {
+		} else if (size > image_size / PAGE_SIZE) {
 			tmp = shrink_all_memory(SHRINK_BITE);
 			pages += tmp;
 		}
-- 
cgit v1.2.3-70-g09d2


From bc1978d404befacd272d0321ef749cc3192e488b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 1 Feb 2006 03:05:08 -0800
Subject: [PATCH] hrtimers: fixup itimer conversion

The itimer conversion removed the locking which protects the timer and
variables in the shared signal structure.  Steven Rostedt found the problem in
the latest -rt patches.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/itimer.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/itimer.c b/kernel/itimer.c
index c2c05c4ff28..6433d068550 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -49,9 +49,11 @@ int do_getitimer(int which, struct itimerval *value)
 
 	switch (which) {
 	case ITIMER_REAL:
+		spin_lock_irq(&tsk->sighand->siglock);
 		value->it_value = itimer_get_remtime(&tsk->signal->real_timer);
 		value->it_interval =
 			ktime_to_timeval(tsk->signal->it_real_incr);
+		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	case ITIMER_VIRTUAL:
 		read_lock(&tasklist_lock);
@@ -150,8 +152,14 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 
 	switch (which) {
 	case ITIMER_REAL:
+again:
+		spin_lock_irq(&tsk->sighand->siglock);
 		timer = &tsk->signal->real_timer;
-		hrtimer_cancel(timer);
+		/* We are sharing ->siglock with it_real_fn() */
+		if (hrtimer_try_to_cancel(timer) < 0) {
+			spin_unlock_irq(&tsk->sighand->siglock);
+			goto again;
+		}
 		if (ovalue) {
 			ovalue->it_value = itimer_get_remtime(timer);
 			ovalue->it_interval
@@ -162,6 +170,7 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 		expires = timeval_to_ktime(value->it_value);
 		if (expires.tv64 != 0)
 			hrtimer_start(timer, expires, HRTIMER_REL);
+		spin_unlock_irq(&tsk->sighand->siglock);
 		break;
 	case ITIMER_VIRTUAL:
 		nval = timeval_to_cputime(&value->it_value);
-- 
cgit v1.2.3-70-g09d2


From b6557fbca805217588a412f391a65ceafcf1a1af Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 1 Feb 2006 03:05:09 -0800
Subject: [PATCH] hrtimers: fix possible use of NULL pointer in posix-timers

Fixup the conversion of posix-timers to hrtimers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 197208b3aa2..3b606d361b5 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -290,7 +290,8 @@ void do_schedule_next_timer(struct siginfo *info)
 		info->si_overrun = timr->it_overrun_last;
 	}
 
-	unlock_timer(timr, flags);
+	if (timr)
+		unlock_timer(timr, flags);
 }
 
 int posix_timer_event(struct k_itimer *timr,int si_private)
-- 
cgit v1.2.3-70-g09d2


From a16a1c095a2392d49fafea22f3a508e268ef7167 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 1 Feb 2006 03:05:09 -0800
Subject: [PATCH] hrtimers: fix oldvalue return in setitimer

This resolves bugzilla bug#5617.  The oldvalue of the timer was read after the
timer was cancelled, so the remaining time was always zero.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/itimer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6433d068550..379be2f8c84 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -155,16 +155,16 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 again:
 		spin_lock_irq(&tsk->sighand->siglock);
 		timer = &tsk->signal->real_timer;
-		/* We are sharing ->siglock with it_real_fn() */
-		if (hrtimer_try_to_cancel(timer) < 0) {
-			spin_unlock_irq(&tsk->sighand->siglock);
-			goto again;
-		}
 		if (ovalue) {
 			ovalue->it_value = itimer_get_remtime(timer);
 			ovalue->it_interval
 				= ktime_to_timeval(tsk->signal->it_real_incr);
 		}
+		/* We are sharing ->siglock with it_real_fn() */
+		if (hrtimer_try_to_cancel(timer) < 0) {
+			spin_unlock_irq(&tsk->sighand->siglock);
+			goto again;
+		}
 		tsk->signal->it_real_incr =
 			timeval_to_ktime(value->it_interval);
 		expires = timeval_to_ktime(value->it_value);
-- 
cgit v1.2.3-70-g09d2


From ff60a5dc4fa584d47022d2533bc5c53b80096fb5 Mon Sep 17 00:00:00 2001
From: "akpm@osdl.org" <akpm@osdl.org>
Date: Wed, 1 Feb 2006 03:05:10 -0800
Subject: [PATCH] hrtimers: fix posix-timer requeue race

From: Steven Rostedtrostedt@goodmis.org <rostedt@goodmis.org>

CPU0 expires a posix-timer and runs the callback function.  The signal is
queued.

After releasing the posix-timer lock and before returning to hrtimer_run_queue
CPU0 gets interrupted.  CPU1 delivers the queued signal and rearms the timer.
CPU0 comes back to hrtimer_run_queue and sets the timer state to expired.

The next modification of the timer can result in an oops, because the state
information is wrong.

Keep track of state = RUNNING and check if the state has been in the return
path of hrtimer_run_queue.  In case the state has been changed, ignore a
restart request and do not touch the state variable.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h | 1 +
 kernel/hrtimer.c        | 5 +++++
 2 files changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 089bfb1fa01..c657f3d4924 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -40,6 +40,7 @@ enum hrtimer_restart {
 enum hrtimer_state {
 	HRTIMER_INACTIVE,	/* Timer is inactive */
 	HRTIMER_EXPIRED,		/* Timer is expired */
+	HRTIMER_RUNNING,		/* Timer is running the callback function */
 	HRTIMER_PENDING,		/* Timer is pending */
 };
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f1c4155b49a..f580dd9db28 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -550,6 +550,7 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 		fn = timer->function;
 		data = timer->data;
 		set_curr_timer(base, timer);
+		timer->state = HRTIMER_RUNNING;
 		__remove_hrtimer(timer, base);
 		spin_unlock_irq(&base->lock);
 
@@ -565,6 +566,10 @@ static inline void run_hrtimer_queue(struct hrtimer_base *base)
 
 		spin_lock_irq(&base->lock);
 
+		/* Another CPU has added back the timer */
+		if (timer->state != HRTIMER_RUNNING)
+			continue;
+
 		if (restart == HRTIMER_RESTART)
 			enqueue_hrtimer(timer, base);
 		else
-- 
cgit v1.2.3-70-g09d2


From 7978672c4d9a1e6a6081de3a9d9ba5e5b24904a0 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@wildturkeyranch.net>
Date: Wed, 1 Feb 2006 03:05:11 -0800
Subject: [PATCH] hrtimers: cleanups and simplifications

Clean up the interface to hrtimers by changing the init code to pass the mode
as well as the clock.  This allow the init code to select the correct base and
eliminates extra timer re-init code in posix-timers.  We also simplify the
restart interface nanosleep use.

Signed-off-by: George Anzinger <george@mvista.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hrtimer.h |  5 ++---
 kernel/fork.c           |  2 +-
 kernel/hrtimer.c        | 59 ++++++++++++++++++++-----------------------------
 kernel/posix-timers.c   | 37 ++++++++-----------------------
 4 files changed, 36 insertions(+), 67 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index c657f3d4924..6361544bb6a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -101,9 +101,8 @@ struct hrtimer_base {
 /* Exported timer functions: */
 
 /* Initialize timers: */
-extern void hrtimer_init(struct hrtimer *timer, const clockid_t which_clock);
-extern void hrtimer_rebase(struct hrtimer *timer, const clockid_t which_clock);
-
+extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock,
+			 enum hrtimer_mode mode);
 
 /* Basic timer operations: */
 extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
diff --git a/kernel/fork.c b/kernel/fork.c
index 4ae8cfc1c89..7f0ab5ee948 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -802,7 +802,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 
-	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC);
+	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_REL);
 	sig->it_real_incr.tv64 = 0;
 	sig->real_timer.function = it_real_fn;
 	sig->real_timer.data = tsk;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f580dd9db28..efff9496b2f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -66,6 +66,12 @@ EXPORT_SYMBOL_GPL(ktime_get_real);
 
 /*
  * The timer bases:
+ *
+ * Note: If we want to add new timer bases, we have to skip the two
+ * clock ids captured by the cpu-timers. We do this by holding empty
+ * entries rather than doing math adjustment of the clock ids.
+ * This ensures that we capture erroneous accesses to these clock ids
+ * rather than moving them into the range of valid clock id's.
  */
 
 #define MAX_HRTIMER_BASES 2
@@ -483,29 +489,25 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 }
 
 /**
- * hrtimer_rebase - rebase an initialized hrtimer to a different base
+ * hrtimer_init - initialize a timer to the given clock
  *
- * @timer:	the timer to be rebased
+ * @timer:	the timer to be initialized
  * @clock_id:	the clock to be used
+ * @mode:	timer mode abs/rel
  */
-void hrtimer_rebase(struct hrtimer *timer, const clockid_t clock_id)
+void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
+		  enum hrtimer_mode mode)
 {
 	struct hrtimer_base *bases;
 
+	memset(timer, 0, sizeof(struct hrtimer));
+
 	bases = per_cpu(hrtimer_bases, raw_smp_processor_id());
-	timer->base = &bases[clock_id];
-}
 
-/**
- * hrtimer_init - initialize a timer to the given clock
- *
- * @timer:	the timer to be initialized
- * @clock_id:	the clock to be used
- */
-void hrtimer_init(struct hrtimer *timer, const clockid_t clock_id)
-{
-	memset(timer, 0, sizeof(struct hrtimer));
-	hrtimer_rebase(timer, clock_id);
+	if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
+		clock_id = CLOCK_MONOTONIC;
+
+	timer->base = &bases[clock_id];
 }
 
 /**
@@ -643,8 +645,7 @@ schedule_hrtimer_interruptible(struct hrtimer *timer,
 	return schedule_hrtimer(timer, mode);
 }
 
-static long __sched
-nanosleep_restart(struct restart_block *restart, clockid_t clockid)
+static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	struct timespec __user *rmtp;
 	struct timespec tu;
@@ -654,7 +655,7 @@ nanosleep_restart(struct restart_block *restart, clockid_t clockid)
 
 	restart->fn = do_no_restart_syscall;
 
-	hrtimer_init(&timer, clockid);
+	hrtimer_init(&timer, (clockid_t) restart->arg3, HRTIMER_ABS);
 
 	timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0;
 
@@ -674,16 +675,6 @@ nanosleep_restart(struct restart_block *restart, clockid_t clockid)
 	return -ERESTART_RESTARTBLOCK;
 }
 
-static long __sched nanosleep_restart_mono(struct restart_block *restart)
-{
-	return nanosleep_restart(restart, CLOCK_MONOTONIC);
-}
-
-static long __sched nanosleep_restart_real(struct restart_block *restart)
-{
-	return nanosleep_restart(restart, CLOCK_REALTIME);
-}
-
 long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		       const enum hrtimer_mode mode, const clockid_t clockid)
 {
@@ -692,7 +683,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	struct timespec tu;
 	ktime_t rem;
 
-	hrtimer_init(&timer, clockid);
+	hrtimer_init(&timer, clockid, mode);
 
 	timer.expires = timespec_to_ktime(*rqtp);
 
@@ -700,7 +691,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 	if (rem.tv64 <= 0)
 		return 0;
 
-	/* Absolute timers do not update the rmtp value: */
+	/* Absolute timers do not update the rmtp value and restart: */
 	if (mode == HRTIMER_ABS)
 		return -ERESTARTNOHAND;
 
@@ -710,11 +701,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
 		return -EFAULT;
 
 	restart = &current_thread_info()->restart_block;
-	restart->fn = (clockid == CLOCK_MONOTONIC) ?
-		nanosleep_restart_mono : nanosleep_restart_real;
+	restart->fn = nanosleep_restart;
 	restart->arg0 = timer.expires.tv64 & 0xFFFFFFFF;
 	restart->arg1 = timer.expires.tv64 >> 32;
 	restart->arg2 = (unsigned long) rmtp;
+	restart->arg3 = (unsigned long) timer.base->index;
 
 	return -ERESTART_RESTARTBLOCK;
 }
@@ -741,10 +732,8 @@ static void __devinit init_hrtimers_cpu(int cpu)
 	struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
 	int i;
 
-	for (i = 0; i < MAX_HRTIMER_BASES; i++) {
+	for (i = 0; i < MAX_HRTIMER_BASES; i++, base++)
 		spin_lock_init(&base->lock);
-		base++;
-	}
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 3b606d361b5..28e72fd0029 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -194,9 +194,7 @@ static inline int common_clock_set(const clockid_t which_clock,
 
 static int common_timer_create(struct k_itimer *new_timer)
 {
-	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock);
-	new_timer->it.real.timer.data = new_timer;
-	new_timer->it.real.timer.function = posix_timer_fn;
+	hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
 	return 0;
 }
 
@@ -693,6 +691,7 @@ common_timer_set(struct k_itimer *timr, int flags,
 		 struct itimerspec *new_setting, struct itimerspec *old_setting)
 {
 	struct hrtimer *timer = &timr->it.real.timer;
+	enum hrtimer_mode mode;
 
 	if (old_setting)
 		common_timer_get(timr, old_setting);
@@ -714,14 +713,10 @@ common_timer_set(struct k_itimer *timr, int flags,
 	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
 		return 0;
 
-	/* Posix madness. Only absolute CLOCK_REALTIME timers
-	 * are affected by clock sets. So we must reiniatilize
-	 * the timer.
-	 */
-	if (timr->it_clock == CLOCK_REALTIME && (flags & TIMER_ABSTIME))
-		hrtimer_rebase(timer, CLOCK_REALTIME);
-	else
-		hrtimer_rebase(timer, CLOCK_MONOTONIC);
+	mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
+	hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
+	timr->it.real.timer.data = timr;
+	timr->it.real.timer.function = posix_timer_fn;
 
 	timer->expires = timespec_to_ktime(new_setting->it_value);
 
@@ -732,8 +727,7 @@ common_timer_set(struct k_itimer *timr, int flags,
 	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
 		return 0;
 
-	hrtimer_start(timer, timer->expires, (flags & TIMER_ABSTIME) ?
-		      HRTIMER_ABS : HRTIMER_REL);
+	hrtimer_start(timer, timer->expires, mode);
 	return 0;
 }
 
@@ -948,21 +942,8 @@ sys_clock_getres(const clockid_t which_clock, struct timespec __user *tp)
 static int common_nsleep(const clockid_t which_clock, int flags,
 			 struct timespec *tsave, struct timespec __user *rmtp)
 {
-	int mode = flags & TIMER_ABSTIME ? HRTIMER_ABS : HRTIMER_REL;
-	int clockid = which_clock;
-
-	switch (which_clock) {
-	case CLOCK_REALTIME:
-		/* Posix madness. Only absolute timers on clock realtime
-		   are affected by clock set. */
-		if (mode != HRTIMER_ABS)
-			clockid = CLOCK_MONOTONIC;
-	case CLOCK_MONOTONIC:
-		break;
-	default:
-		return -EINVAL;
-	}
-	return hrtimer_nanosleep(tsave, rmtp, mode, clockid);
+	return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ?
+				 HRTIMER_ABS : HRTIMER_REL, which_clock);
 }
 
 asmlinkage long
-- 
cgit v1.2.3-70-g09d2


From 66188fae3bf7f8dd951e2291d2a81888ed1b65de Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 1 Feb 2006 03:05:13 -0800
Subject: [PATCH] hrtimers: add back lost credit lines

At some point we added credits to people who actively helped to bring
k/hr-timers along.  This was lost in the big code revamp.  Add it back.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/ktime.h | 6 ++++++
 kernel/hrtimer.c      | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ktime.h b/include/linux/ktime.h
index 1bd6552cc34..6aca67a569a 100644
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@@ -10,6 +10,12 @@
  *
  *  Started by: Thomas Gleixner and Ingo Molnar
  *
+ *  Credits:
+ *
+ *  	Roman Zippel provided the ideas and primary code snippets of
+ *  	the ktime_t union and further simplifications of the original
+ *  	code.
+ *
  *  For licencing details see kernel-base/COPYING
  */
 #ifndef _LINUX_KTIME_H
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index efff9496b2f..2b6e1757aed 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -21,6 +21,12 @@
  *  Credits:
  *	based on kernel/timer.c
  *
+ *	Help, testing, suggestions, bugfixes, improvements were
+ *	provided by:
+ *
+ *	George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel
+ *	et. al.
+ *
  *  For licencing details see kernel-base/COPYING
  */
 
-- 
cgit v1.2.3-70-g09d2


From 952bbc87f01f552ef091a62ea2a721b5b2670e74 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 1 Feb 2006 03:05:13 -0800
Subject: [PATCH] hrtimers: set correct initial expiry time for relative
 SIGEV_NONE timers

The expiry time for relative timers with SIGEV_NONE set was never
updated to the correct value.

Pointed out by George Anzinger.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 28e72fd0029..aad6f138d5c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -724,8 +724,13 @@ common_timer_set(struct k_itimer *timr, int flags,
 	timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
 
 	/* SIGEV_NONE timers are not queued ! See common_timer_get */
-	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
+	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
+		/* Setup correct expiry time for relative timers */
+		if (mode == HRTIMER_REL)
+			timer->expires = ktime_add(timer->expires,
+						   timer->base->get_time());
 		return 0;
+	}
 
 	hrtimer_start(timer, timer->expires, mode);
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 493f01d1d0699ddafc30067d33fcc18d0b95b624 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Wed, 1 Feb 2006 03:05:14 -0800
Subject: [PATCH] kernel/posix-timers.c: remove do_posix_clock_notimer_create()

This function is neither used nor has any real contents.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/posix-timers.h | 1 -
 kernel/posix-timers.c        | 6 ------
 2 files changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index 54faf5236da..95572c434bc 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -84,7 +84,6 @@ struct k_clock {
 void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock);
 
 /* error handlers for timer_create, nanosleep and settime */
-int do_posix_clock_notimer_create(struct k_itimer *timer);
 int do_posix_clock_nonanosleep(const clockid_t, int flags, struct timespec *,
 			       struct timespec __user *);
 int do_posix_clock_nosettime(const clockid_t, struct timespec *tp);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index aad6f138d5c..216f574b5ff 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -875,12 +875,6 @@ int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
 
-int do_posix_clock_notimer_create(struct k_itimer *timer)
-{
-	return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
-
 int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
 			       struct timespec *t, struct timespec __user *r)
 {
-- 
cgit v1.2.3-70-g09d2


From 2f7016d917faef8f1e016b4a7bd7f594694480b6 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Wed, 1 Feb 2006 03:05:18 -0800
Subject: [PATCH] sys_sched_getaffinity() & hotplug

Change sched_getaffinity() so that it returns a bitmap that indicates the
legally schedulable cpus that a task is allowed to run on.

Without this patch, if CONFIG_HOTPLUG_CPU is enabled, sched_getaffinity()
unconditionally returns (at least on IA64) a mask with NR_CPUS bits set.
This conveys no useful infornmation except for a kernel compile option.

This fixes a breakage we obseved running recent kernels. We have MPI jobs
that use sched_getaffinity() to determine where to place their threads.
Placing them on non-existant cpus is problematic :-)

Signed-off-by: Jack Steiner <steiner@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nathan Lynch <ntl@pobox.com>
Cc: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ec7fd9cee30..f77f23f8f47 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4031,7 +4031,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 		goto out_unlock;
 
 	retval = 0;
-	cpus_and(*mask, p->cpus_allowed, cpu_possible_map);
+	cpus_and(*mask, p->cpus_allowed, cpu_online_map);
 
 out_unlock:
 	read_unlock(&tasklist_lock);
-- 
cgit v1.2.3-70-g09d2


From f7b8988ff50d99c99746f65f420364e91362c065 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 1 Feb 2006 03:05:21 -0800
Subject: [PATCH] swsusp: do not change log level during suspend/resume

Prevent the kernel from setting the log level to 10 unconditionally during
suspend/resume which was needed in the past for debugging, but generally is
undesirable.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/console.c | 12 +-----------
 kernel/power/power.h   |  5 +++++
 2 files changed, 6 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/console.c b/kernel/power/console.c
index 7ff375e7c95..579d239d129 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,18 +9,11 @@
 #include <linux/console.h>
 #include "power.h"
 
-static int new_loglevel = 10;
-static int orig_loglevel;
 #ifdef SUSPEND_CONSOLE
 static int orig_fgconsole, orig_kmsg;
-#endif
 
 int pm_prepare_console(void)
 {
-	orig_loglevel = console_loglevel;
-	console_loglevel = new_loglevel;
-
-#ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
 
 	orig_fgconsole = fg_console;
@@ -41,18 +34,15 @@ int pm_prepare_console(void)
 	}
 	orig_kmsg = kmsg_redirect;
 	kmsg_redirect = SUSPEND_CONSOLE;
-#endif
 	return 0;
 }
 
 void pm_restore_console(void)
 {
-	console_loglevel = orig_loglevel;
-#ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
 	set_console(orig_fgconsole);
 	release_console_sem();
 	kmsg_redirect = orig_kmsg;
-#endif
 	return;
 }
+#endif
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 61beb5e0e92..d8f0d1a76ba 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -42,8 +42,13 @@ static struct subsys_attribute _name##_attr = {	\
 
 extern struct subsystem power_subsys;
 
+#ifdef SUSPEND_CONSOLE
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
+#else
+static int pm_prepare_console(void) { return 0; }
+static void pm_restore_console(void) {}
+#endif
 
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
-- 
cgit v1.2.3-70-g09d2


From c84db23c6e587d3ab00a41c51fedf758e1f6ecd4 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 1 Feb 2006 03:05:29 -0800
Subject: [PATCH] zone_reclaim: minor fixes

- If we only reclaim nr_pages then its okay to stay on node.
  Switch from > to >= for the comparison.

- vm_table[] entry for zone_reclaim_mode is a bit screwed up.

- Add empty lines around shrink_zone to show that this is the
  central function to be called.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 3 ++-
 mm/vmscan.c     | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index cb99a42f8b3..c74f03bc014 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -878,7 +878,8 @@ static ctl_table vm_table[] = {
 		.maxlen		= sizeof(zone_reclaim_mode),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
-		.strategy	= &zero,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
 	},
 #endif
 	{ .ctl_name = 0 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a29efb2c06c..61ca0097c83 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1636,14 +1636,16 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
+
 	shrink_zone(zone, &sc);
+
 	p->reclaim_state = NULL;
 	current->flags &= ~PF_MEMALLOC;
 
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	return sc.nr_reclaimed > nr_pages;
+	return sc.nr_reclaimed >= nr_pages;
 }
 #endif
 
-- 
cgit v1.2.3-70-g09d2


From 2a11ff06d7d12be5d1bbcf592fff649b45ac2388 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 1 Feb 2006 03:05:33 -0800
Subject: [PATCH] zone_reclaim: configurable off node allocation period.

Currently the zone_reclaim code has a fixed window of 30 seconds of off node
allocations should a local zone have no unused pagecache pages left.  Reclaim
will be attempted again after this timeout period to avoid repeated useless
scans for memory.  This is also useful to established sufficiently large off
node allocation chunks to relieve the local node.

It may be beneficial to adjust that time period for some special situations.
For example if memory use was exceeding node capacity one may want to give up
for longer periods of time.  If memory spikes intermittendly then one may want
to shorten the time period to reduce the number of off node allocations.

This patch allows just that....

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/vm.txt | 12 ++++++++++++
 include/linux/swap.h        |  1 +
 include/linux/sysctl.h      |  3 ++-
 kernel/sysctl.c             |  9 +++++++++
 mm/vmscan.c                 |  4 ++--
 5 files changed, 26 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 391dd64363e..44518c02394 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/vm:
 - block_dump
 - drop-caches
 - zone_reclaim_mode
+- zone_reclaim_interval
 
 ==============================================================
 
@@ -137,4 +138,15 @@ of memory should be used for caching files from disk.
 
 It may be beneficial to switch this on if one wants to do zone
 reclaim regardless of the numa distances in the system.
+================================================================
+
+zone_reclaim_interval:
+
+The time allowed for off node allocations after zone reclaim
+has failed to reclaim enough pages to allow a local allocation.
+
+Time is set in seconds and set by default to 30 seconds.
+
+Reduce the interval if undesired off node allocations occur. However, too
+frequent scans will have a negative impact onoff node allocation performance.
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 4a99e4a7fbf..e53fef7051e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -178,6 +178,7 @@ extern int vm_swappiness;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
+extern int zone_reclaim_interval;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
 #define zone_reclaim_mode 0
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 8352a7ce589..32a4139c4ad 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -182,7 +182,8 @@ enum
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
-	VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */
+	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
+	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c74f03bc014..71dd6f62efe 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -881,6 +881,15 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= VM_ZONE_RECLAIM_INTERVAL,
+		.procname	= "zone_reclaim_interval",
+		.data		= &zone_reclaim_interval,
+		.maxlen		= sizeof(zone_reclaim_interval),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f8b94ea6f72..8760a4abfa1 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1595,7 +1595,7 @@ int zone_reclaim_mode __read_mostly;
 /*
  * Mininum time between zone reclaim scans
  */
-#define ZONE_RECLAIM_INTERVAL 30*HZ
+int zone_reclaim_interval __read_mostly = 30*HZ;
 
 /*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -1617,7 +1617,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
 			return 0;
 
 	if (!(gfp_mask & __GFP_WAIT) ||
-- 
cgit v1.2.3-70-g09d2


From e65cefe87beda627c0bfba39b387ee4bffedc93c Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Fri, 3 Feb 2006 03:03:42 -0800
Subject: [PATCH] kernel/kprobes.c: fix a warning #ifndef
 ARCH_SUPPORTS_KRETPROBES

kernel/kprobes.c:353: warning: 'pre_handler_kretprobe' defined but not used

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: "Keshavamurthy, Anil S" <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3ea6325228d..95ad7f8db3d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -344,23 +344,6 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
 	spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 
-/*
- * This kprobe pre_handler is registered with every kretprobe. When probe
- * hits it will set up the return probe.
- */
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
-					   struct pt_regs *regs)
-{
-	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
-	unsigned long flags = 0;
-
-	/*TODO: consider to only swap the RA after the last pre_handler fired */
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	arch_prepare_kretprobe(rp, regs);
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
-	return 0;
-}
-
 static inline void free_rp_inst(struct kretprobe *rp)
 {
 	struct kretprobe_instance *ri;
@@ -578,6 +561,23 @@ void __kprobes unregister_jprobe(struct jprobe *jp)
 
 #ifdef ARCH_SUPPORTS_KRETPROBES
 
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+					   struct pt_regs *regs)
+{
+	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	unsigned long flags = 0;
+
+	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	spin_lock_irqsave(&kretprobe_lock, flags);
+	arch_prepare_kretprobe(rp, regs);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	return 0;
+}
+
 int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	int ret = 0;
-- 
cgit v1.2.3-70-g09d2


From 278ff9537030bbb292b33504f5e1f6e0126793eb Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Fri, 3 Feb 2006 03:03:43 -0800
Subject: [PATCH] Kprobes: Fix deadlock in function-return probes

When two function-return probes are inserted on kfree()[1] and the second
on say, sys_link()[2], and later [2] is unregistered, we have a deadlock as
kfree is called with the kretprobe_lock held and the function-return probe
on kfree will also try to grab the same lock.

However, we can move the kfree() during unregistration to outside the
spinlock as we are sure that no instances from the free list will be used
after synchronized_sched() returns during the unregistration process.
Thanks to Masami Hiramatsu for spotting this.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kprobes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 95ad7f8db3d..fef1af8a73c 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -631,12 +631,12 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
 	unregister_kprobe(&rp->kp);
 	/* No race here */
 	spin_lock_irqsave(&kretprobe_lock, flags);
-	free_rp_inst(rp);
 	while ((ri = get_used_rp_inst(rp)) != NULL) {
 		ri->rp = NULL;
 		hlist_del(&ri->uflist);
 	}
 	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	free_rp_inst(rp);
 }
 
 static int __init init_kprobes(void)
-- 
cgit v1.2.3-70-g09d2


From 54e8ce463a7e21dbe9dad57723ed47653ee5db15 Mon Sep 17 00:00:00 2001
From: Keith Owens <kaos@sgi.com>
Date: Fri, 3 Feb 2006 03:03:53 -0800
Subject: [PATCH] Tell kallsyms_lookup_name() to ignore type U entries

When one module exports a function symbol and another module uses that
symbol then kallsyms shows the symbol twice.  Once from the consumer with a
type of 'U' and once from the provider with a type of 't' or 'T'.  On most
architectures, both entries have the same address so it does not matter
which one is returned by kallsyms_lookup_name().  But on architectures with
function descriptors, the 'U' entry points to the descriptor, not to the
code body, which is not what we want.

IA64 # grep -w qla2x00_remove_one /proc/kallsyms
a000000208c25ef8 U qla2x00_remove_one   [qla2300]   <= descriptor
a000000208bf44c0 t qla2x00_remove_one   [qla2xxx]   <= function body

Tell kallsyms_lookup_name() to ignore type U entries in modules.

Signed-off-by: Keith Owens <kaos@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 618ed6e23ec..e058aedf6b9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2092,7 +2092,8 @@ static unsigned long mod_find_symname(struct module *mod, const char *name)
 	unsigned int i;
 
 	for (i = 0; i < mod->num_symtab; i++)
-		if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0)
+		if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
+		    mod->symtab[i].st_info != 'U')
 			return mod->symtab[i].st_value;
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 88fc3897e3219e63ae6e2d180a6c87d033ef9f3b Mon Sep 17 00:00:00 2001
From: George Anzinger <george@wildturkeyranch.net>
Date: Fri, 3 Feb 2006 03:04:20 -0800
Subject: [PATCH] Normalize timespec for negative values in ns_to_timespec

- In case of a negative nsec value the result of the division must be
  normalized.

- Remove inline from an exported function.

Signed-off-by: George Anzinger <george@wildturkeyranch.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index 1f23e683d6a..804539165d8 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -637,15 +637,16 @@ void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec)
  *
  * Returns the timespec representation of the nsec parameter.
  */
-inline struct timespec ns_to_timespec(const nsec_t nsec)
+struct timespec ns_to_timespec(const nsec_t nsec)
 {
 	struct timespec ts;
 
-	if (nsec)
-		ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC,
-						     &ts.tv_nsec);
-	else
-		ts.tv_sec = ts.tv_nsec = 0;
+	if (!nsec)
+		return (struct timespec) {0, 0};
+
+	ts.tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC, &ts.tv_nsec);
+	if (unlikely(nsec < 0))
+		set_normalized_timespec(&ts, ts.tv_sec, ts.tv_nsec);
 
 	return ts;
 }
-- 
cgit v1.2.3-70-g09d2


From fe85a998ca64a067e58ca9240ec54a95994d78ee Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Fri, 3 Feb 2006 03:04:23 -0800
Subject: [PATCH] cpuset: fix sparse warning

kernel/cpuset.c:644:38: warning: non-ANSI function declaration of function 'cpuset_update_task_memory_state'

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fe2f71f92ae..ba42b0a7696 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -641,7 +641,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * task has been modifying its cpuset.
  */
 
-void cpuset_update_task_memory_state()
+void cpuset_update_task_memory_state(void)
 {
 	int my_cpusets_mem_gen;
 	struct task_struct *tsk = current;
-- 
cgit v1.2.3-70-g09d2


From 514a01b880d28a3029d9e35de72ad8d2f95b31d0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 3 Feb 2006 03:04:41 -0800
Subject: [PATCH] uninline __sigqueue_free()

Five callsites.  I dunno how all this crap got back in there :(

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index d3efafd8109..b373fc2420d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -283,7 +283,7 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	return(q);
 }
 
-static inline void __sigqueue_free(struct sigqueue *q)
+static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
-- 
cgit v1.2.3-70-g09d2


From 88a2a4ac6b671a4b0dd5d2d762418904c05f4104 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 4 Feb 2006 23:27:36 -0800
Subject: [PATCH] percpu data: only iterate over possible CPUs

percpu_data blindly allocates bootmem memory to store NR_CPUS instances of
cpudata, instead of allocating memory only for possible cpus.

As a preparation for changing that, we need to convert various 0 -> NR_CPUS
loops to use for_each_cpu().

(The above only applies to users of asm-generic/percpu.h.  powerpc has gone it
alone and is presently only allocating memory for present CPUs, so it's
currently corrupting memory).

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: James Bottomley <James.Bottomley@steeleye.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Jens Axboe <axboe@suse.de>
Cc: Anton Blanchard <anton@samba.org>
Acked-by: William Irwin <wli@holomorphy.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c |  2 +-
 block/ll_rw_blk.c      |  2 +-
 drivers/scsi/scsi.c    |  2 +-
 fs/file.c              |  3 +--
 kernel/sched.c         |  2 +-
 mm/page_alloc.c        | 10 ++++++----
 net/core/dev.c         |  2 +-
 net/core/utils.c       |  4 ++--
 net/ipv4/proc.c        |  2 +-
 net/ipv6/proc.c        |  2 +-
 net/socket.c           |  2 +-
 11 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index d661703ac1c..63f39a7e2c9 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -138,7 +138,7 @@ static int __init check_nmi_watchdog(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index f9fc07efd2d..e5aad831458 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3453,7 +3453,7 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 245ca99a641..c551bb84dbf 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -1245,7 +1245,7 @@ static int __init init_scsi(void)
 	if (error)
 		goto cleanup_sysctl;
 
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(scsi_done_q, i));
 
 	devfs_mk_dir("scsi");
diff --git a/fs/file.c b/fs/file.c
index fd066b261c7..cea7cbea11d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
 void __init files_defer_init(void)
 {
 	int i;
-	/* Really early - can't use for_each_cpu */
-	for (i = 0; i < NR_CPUS; i++)
+	for_each_cpu(i)
 		fdtable_defer_list_init(i);
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index f77f23f8f47..839466fdfb4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6109,7 +6109,7 @@ void __init sched_init(void)
 	runqueue_t *rq;
 	int i, j, k;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44b4eb4202d..dde04ff4be3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1213,18 +1213,21 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
-	memset(ret, 0, sizeof(*ret));
+	memset(ret, 0, nr * sizeof(unsigned long));
 	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
 	cpu = first_cpu(*cpumask);
 	while (cpu < NR_CPUS) {
 		unsigned long *in, *out, off;
 
+		if (!cpu_isset(cpu, *cpumask))
+			continue;
+
 		in = (unsigned long *)&per_cpu(page_states, cpu);
 
 		cpu = next_cpu(cpu, *cpumask);
 
-		if (cpu < NR_CPUS)
+		if (likely(cpu < NR_CPUS))
 			prefetch(&per_cpu(page_states, cpu));
 
 		out = (unsigned long *)ret;
@@ -1886,8 +1889,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
  * not check if the processor is online before following the pageset pointer.
  * Other parts of the kernel may not check if the zone is available.
  */
-static struct per_cpu_pageset
-	boot_pageset[NR_CPUS];
+static struct per_cpu_pageset boot_pageset[NR_CPUS];
 
 /*
  * Dynamically allocate memory for the
diff --git a/net/core/dev.c b/net/core/dev.c
index ffb82073056..2afb0de9532 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3237,7 +3237,7 @@ static int __init net_dev_init(void)
 	 *	Initialise the packet receive queues.
 	 */
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct softnet_data *queue;
 
 		queue = &per_cpu(softnet_data, i);
diff --git a/net/core/utils.c b/net/core/utils.c
index ac1d1fcf867..fdc4f38bc46 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -121,7 +121,7 @@ void __init net_random_init(void)
 {
 	int i;
 
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct nrnd_state *state = &per_cpu(net_rand_state,i);
 		__net_srandom(state, i+jiffies);
 	}
@@ -133,7 +133,7 @@ static int net_random_reseed(void)
 	unsigned long seed[NR_CPUS];
 
 	get_random_bytes(seed, sizeof(seed));
-	for (i = 0; i < NR_CPUS; i++) {
+	for_each_cpu(i) {
 		struct nrnd_state *state = &per_cpu(net_rand_state,i);
 		__net_srandom(state, seed[i]);
 	}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 39d49dc333a..1b167c4bb3b 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -49,7 +49,7 @@ static int fold_prot_inuse(struct proto *proto)
 	int res = 0;
 	int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		res += proto->stats[cpu].inuse;
 
 	return res;
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 50a13e75d70..4238b1ed886 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,7 @@ static int fold_prot_inuse(struct proto *proto)
 	int res = 0;
 	int cpu;
 
-	for (cpu=0; cpu<NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		res += proto->stats[cpu].inuse;
 
 	return res;
diff --git a/net/socket.c b/net/socket.c
index b38a263853c..a00851f981d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2078,7 +2078,7 @@ void socket_seq_show(struct seq_file *seq)
 	int cpu;
 	int counter = 0;
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++)
+	for_each_cpu(cpu)
 		counter += per_cpu(sockets_in_use, cpu);
 
 	/* It can be negative, by the way. 8) */
-- 
cgit v1.2.3-70-g09d2


From bd576c9523fbf23e94fb7dbe05d2ae1cf96864e4 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <76306.1226@compuserve.com>
Date: Sat, 4 Feb 2006 23:27:42 -0800
Subject: [PATCH] sched: only print migration_cost once per boot

migration_cost prints after every CPU hotplug event.  Make it print only
once at boot.

Signed-off-by: Chuck Ebbert <76306.1226@compuserve.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 839466fdfb4..bc38804e40d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5551,13 +5551,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
 			-1
 #endif
 		);
-	printk("migration_cost=");
-	for (distance = 0; distance <= max_distance; distance++) {
-		if (distance)
-			printk(",");
-		printk("%ld", (long)migration_cost[distance] / 1000);
+	if (system_state == SYSTEM_BOOTING) {
+		printk("migration_cost=");
+		for (distance = 0; distance <= max_distance; distance++) {
+			if (distance)
+				printk(",");
+			printk("%ld", (long)migration_cost[distance] / 1000);
+		}
+		printk("\n");
 	}
-	printk("\n");
 	j1 = jiffies;
 	if (migration_debug)
 		printk("migration: %ld seconds\n", (j1-j0)/HZ);
-- 
cgit v1.2.3-70-g09d2


From 5c0d5d262aa4c5e93f9f5de298cf25d6d8b558c4 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Sat, 4 Feb 2006 23:27:49 -0800
Subject: [PATCH] missing license tag in intermodule

It may suck something awful, but it shouldn't taint the kernel.

Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/intermodule.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 0cbe633420f..55b1e5b85db 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -179,3 +179,6 @@ EXPORT_SYMBOL(inter_module_register);
 EXPORT_SYMBOL(inter_module_unregister);
 EXPORT_SYMBOL(inter_module_get_request);
 EXPORT_SYMBOL(inter_module_put);
+
+MODULE_LICENSE("GPL");
+
-- 
cgit v1.2.3-70-g09d2


From 7714d5985bb7101a90fb427dc29dc592cf1b960e Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 7 Feb 2006 12:58:22 -0800
Subject: [PATCH] swsusp: kill unneeded/unbalanced bio_get

- Remove unneeded bio_get() which would cause a bio leak

- Writing doesn't dirty pages.  Reading dirties pages.

- We should dirty the pages after the IO completion, not before

(Busy-waiting for disk I/O completion isn't very polite.)

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 59c91c148e8..4e90905f0e8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -743,7 +743,6 @@ static int submit(int rw, pgoff_t page_off, void *page)
 	if (!bio)
 		return -ENOMEM;
 	bio->bi_sector = page_off * (PAGE_SIZE >> 9);
-	bio_get(bio);
 	bio->bi_bdev = resume_bdev;
 	bio->bi_end_io = end_io;
 
@@ -753,14 +752,13 @@ static int submit(int rw, pgoff_t page_off, void *page)
 		goto Done;
 	}
 
-	if (rw == WRITE)
-		bio_set_pages_dirty(bio);
 
 	atomic_set(&io_done, 1);
 	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	while (atomic_read(&io_done))
 		yield();
-
+	if (rw == READ)
+		bio_set_pages_dirty(bio);
  Done:
 	bio_put(bio);
 	return error;
-- 
cgit v1.2.3-70-g09d2


From 8e08b756869eeb08ace17ad64c2a8cb97b18e856 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Tue, 7 Feb 2006 12:58:45 -0800
Subject: [PATCH] module: strlen_user() race fix

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/module.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index e058aedf6b9..5aad477ddc7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1670,6 +1670,9 @@ static struct module *load_module(void __user *umod,
 		goto free_mod;
 	}
 
+	/* Userspace could have altered the string after the strlen_user() */
+	args[arglen - 1] = '\0';
+
 	if (find_module(mod->name)) {
 		err = -EEXIST;
 		goto free_mod;
-- 
cgit v1.2.3-70-g09d2


From 46cd2f32baf181b74b16cceb123bab6fe1f61f85 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 7 Feb 2006 12:58:50 -0800
Subject: [PATCH] Fix build failure in recent pm_prepare_* changes.

Fix compilation problem in PM headers.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h | 10 +++++++++-
 kernel/power/console.c  |  4 +++-
 kernel/power/power.h    | 16 ----------------
 3 files changed, 12 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 43bcd13eb1e..37c1c76fd54 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -42,13 +42,21 @@ extern void mark_free_pages(struct zone *zone);
 #ifdef CONFIG_PM
 /* kernel/power/swsusp.c */
 extern int software_suspend(void);
+
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+extern int pm_prepare_console(void);
+extern void pm_restore_console(void);
+#else
+static inline int pm_prepare_console(void) { return 0; }
+static inline void pm_restore_console(void) {}
+#endif /* defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) */
 #else
 static inline int software_suspend(void)
 {
 	printk("Warning: fake suspend called\n");
 	return -EPERM;
 }
-#endif
+#endif /* CONFIG_PM */
 
 #ifdef CONFIG_SUSPEND_SMP
 extern void disable_nonboot_cpus(void);
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 579d239d129..623786d4415 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,7 +9,9 @@
 #include <linux/console.h>
 #include "power.h"
 
-#ifdef SUSPEND_CONSOLE
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
+
 static int orig_fgconsole, orig_kmsg;
 
 int pm_prepare_console(void)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index d8f0d1a76ba..388dba68084 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,14 +1,6 @@
 #include <linux/suspend.h>
 #include <linux/utsname.h>
 
-/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
-   we probably do not take enough locks for switching consoles, etc,
-   so bad things might happen.
-*/
-#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
-#define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
-#endif
-
 struct swsusp_info {
 	struct new_utsname	uts;
 	u32			version_code;
@@ -42,14 +34,6 @@ static struct subsys_attribute _name##_attr = {	\
 
 extern struct subsystem power_subsys;
 
-#ifdef SUSPEND_CONSOLE
-extern int pm_prepare_console(void);
-extern void pm_restore_console(void);
-#else
-static int pm_prepare_console(void) { return 0; }
-static void pm_restore_console(void) {}
-#endif
-
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-- 
cgit v1.2.3-70-g09d2


From cf2e340f4249b781b3d2beb41e891d08581f0e10 Mon Sep 17 00:00:00 2001
From: JANAK DESAI <janak@us.ibm.com>
Date: Tue, 7 Feb 2006 12:58:58 -0800
Subject: [PATCH] unshare system call -v5: system call handler function

sys_unshare system call handler function accepts the same flags as clone
system call, checks constraints on each of the flags and invokes corresponding
unshare functions to disassociate respective process context if it was being
shared with another task.

Here is the link to a program for testing unshare system call.

http://prdownloads.sourceforge.net/audit/unshare_test.c?download

Please note that because of a problem in rmdir associated with bind mounts and
clone with CLONE_NEWNS, the test fails while trying to remove temporary test
directory.  You can remove that temporary directory by doing rmdir, twice,
from the command line.  The first will fail with EBUSY, but the second will
succeed.  I have reported the problem to Ram Pai and Al Viro with a small
program which reproduces the problem.  Al told us yesterday that he will be
looking at the problem soon.  I have tried multiple rmdirs from the
unshare_test program itself, but for some reason that is not working.  Doing
two rmdirs from command line does seem to remove the directory.

Signed-off-by: Janak Desai <janak@us.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 7f0ab5ee948..6eb9362775f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1323,3 +1323,235 @@ void __init proc_caches_init(void)
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 }
+
+
+/*
+ * Check constraints on flags passed to the unshare system call and
+ * force unsharing of additional process context as appropriate.
+ */
+static inline void check_unshare_flags(unsigned long *flags_ptr)
+{
+	/*
+	 * If unsharing a thread from a thread group, must also
+	 * unshare vm.
+	 */
+	if (*flags_ptr & CLONE_THREAD)
+		*flags_ptr |= CLONE_VM;
+
+	/*
+	 * If unsharing vm, must also unshare signal handlers.
+	 */
+	if (*flags_ptr & CLONE_VM)
+		*flags_ptr |= CLONE_SIGHAND;
+
+	/*
+	 * If unsharing signal handlers and the task was created
+	 * using CLONE_THREAD, then must unshare the thread
+	 */
+	if ((*flags_ptr & CLONE_SIGHAND) &&
+	    (atomic_read(&current->signal->count) > 1))
+		*flags_ptr |= CLONE_THREAD;
+
+	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (*flags_ptr & CLONE_NEWNS)
+		*flags_ptr |= CLONE_FS;
+}
+
+/*
+ * Unsharing of tasks created with CLONE_THREAD is not supported yet
+ */
+static int unshare_thread(unsigned long unshare_flags)
+{
+	if (unshare_flags & CLONE_THREAD)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of fs info for tasks created with CLONE_FS is not supported yet
+ */
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+	struct fs_struct *fs = current->fs;
+
+	if ((unshare_flags & CLONE_FS) &&
+	    (fs && atomic_read(&fs->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of namespace for tasks created without CLONE_NEWNS is not
+ * supported yet
+ */
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+{
+	struct namespace *ns = current->namespace;
+
+	if ((unshare_flags & CLONE_NEWNS) &&
+	    (ns && atomic_read(&ns->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
+ * supported yet
+ */
+static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
+{
+	struct sighand_struct *sigh = current->sighand;
+
+	if ((unshare_flags & CLONE_SIGHAND) &&
+	    (sigh && atomic_read(&sigh->count) > 1))
+		return -EINVAL;
+	else
+		return 0;
+}
+
+/*
+ * Unsharing of vm for tasks created with CLONE_VM is not supported yet
+ */
+static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
+{
+	struct mm_struct *mm = current->mm;
+
+	if ((unshare_flags & CLONE_VM) &&
+	    (mm && atomic_read(&mm->mm_users) > 1))
+		return -EINVAL;
+
+	return 0;
+
+}
+
+/*
+ * Unsharing of files for tasks created with CLONE_FILES is not supported yet
+ */
+static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
+{
+	struct files_struct *fd = current->files;
+
+	if ((unshare_flags & CLONE_FILES) &&
+	    (fd && atomic_read(&fd->count) > 1))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Unsharing of semundo for tasks created with CLONE_SYSVSEM is not
+ * supported yet
+ */
+static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **new_ulistp)
+{
+	if (unshare_flags & CLONE_SYSVSEM)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * unshare allows a process to 'unshare' part of the process
+ * context which was originally shared using clone.  copy_*
+ * functions used by do_fork() cannot be used here directly
+ * because they modify an inactive task_struct that is being
+ * constructed. Here we are modifying the current, active,
+ * task_struct.
+ */
+asmlinkage long sys_unshare(unsigned long unshare_flags)
+{
+	int err = 0;
+	struct fs_struct *fs, *new_fs = NULL;
+	struct namespace *ns, *new_ns = NULL;
+	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
+	struct files_struct *fd, *new_fd = NULL;
+	struct sem_undo_list *new_ulist = NULL;
+
+	check_unshare_flags(&unshare_flags);
+
+	if ((err = unshare_thread(unshare_flags)))
+		goto bad_unshare_out;
+	if ((err = unshare_fs(unshare_flags, &new_fs)))
+		goto bad_unshare_cleanup_thread;
+	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+		goto bad_unshare_cleanup_fs;
+	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
+		goto bad_unshare_cleanup_ns;
+	if ((err = unshare_vm(unshare_flags, &new_mm)))
+		goto bad_unshare_cleanup_sigh;
+	if ((err = unshare_fd(unshare_flags, &new_fd)))
+		goto bad_unshare_cleanup_vm;
+	if ((err = unshare_semundo(unshare_flags, &new_ulist)))
+		goto bad_unshare_cleanup_fd;
+
+	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) {
+
+		task_lock(current);
+
+		if (new_fs) {
+			fs = current->fs;
+			current->fs = new_fs;
+			new_fs = fs;
+		}
+
+		if (new_ns) {
+			ns = current->namespace;
+			current->namespace = new_ns;
+			new_ns = ns;
+		}
+
+		if (new_sigh) {
+			sigh = current->sighand;
+			current->sighand = new_sigh;
+			new_sigh = sigh;
+		}
+
+		if (new_mm) {
+			mm = current->mm;
+			active_mm = current->active_mm;
+			current->mm = new_mm;
+			current->active_mm = new_mm;
+			activate_mm(active_mm, new_mm);
+			new_mm = mm;
+		}
+
+		if (new_fd) {
+			fd = current->files;
+			current->files = new_fd;
+			new_fd = fd;
+		}
+
+		task_unlock(current);
+	}
+
+bad_unshare_cleanup_fd:
+	if (new_fd)
+		put_files_struct(new_fd);
+
+bad_unshare_cleanup_vm:
+	if (new_mm)
+		mmput(new_mm);
+
+bad_unshare_cleanup_sigh:
+	if (new_sigh)
+		if (atomic_dec_and_test(&new_sigh->count))
+			kmem_cache_free(sighand_cachep, new_sigh);
+
+bad_unshare_cleanup_ns:
+	if (new_ns)
+		put_namespace(new_ns);
+
+bad_unshare_cleanup_fs:
+	if (new_fs)
+		put_fs_struct(new_fs);
+
+bad_unshare_cleanup_thread:
+bad_unshare_out:
+	return err;
+}
-- 
cgit v1.2.3-70-g09d2


From 99d1419d96d7df9cfa56bc977810be831bd5ef64 Mon Sep 17 00:00:00 2001
From: JANAK DESAI <janak@us.ibm.com>
Date: Tue, 7 Feb 2006 12:58:59 -0800
Subject: [PATCH] unshare system call -v5: unshare filesystem info

If filesystem structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai <janak@us.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 6eb9362775f..598e5c27242 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1371,15 +1371,18 @@ static int unshare_thread(unsigned long unshare_flags)
 }
 
 /*
- * Unsharing of fs info for tasks created with CLONE_FS is not supported yet
+ * Unshare the filesystem structure if it is being shared
  */
 static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 {
 	struct fs_struct *fs = current->fs;
 
 	if ((unshare_flags & CLONE_FS) &&
-	    (fs && atomic_read(&fs->count) > 1))
-		return -EINVAL;
+	    (fs && atomic_read(&fs->count) > 1)) {
+		*new_fsp = __copy_fs_struct(current->fs);
+		if (!*new_fsp)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 741a295130606143edbf9fc740f633dbc1e6225f Mon Sep 17 00:00:00 2001
From: JANAK DESAI <janak@us.ibm.com>
Date: Tue, 7 Feb 2006 12:59:00 -0800
Subject: [PATCH] unshare system call -v5: unshare namespace

If the namespace structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai <janak@us.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/namespace.c            | 56 ++++++++++++++++++++++++++++++-----------------
 include/linux/namespace.h |  1 +
 kernel/fork.c             | 17 +++++++++-----
 3 files changed, 48 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/fs/namespace.c b/fs/namespace.c
index ce97becff46..a2bef5c8103 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1325,27 +1325,17 @@ dput_out:
 	return retval;
 }
 
-int copy_namespace(int flags, struct task_struct *tsk)
+/*
+ * Allocate a new namespace structure and populate it with contents
+ * copied from the namespace of the passed in task structure.
+ */
+struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs)
 {
 	struct namespace *namespace = tsk->namespace;
 	struct namespace *new_ns;
 	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL;
-	struct fs_struct *fs = tsk->fs;
 	struct vfsmount *p, *q;
 
-	if (!namespace)
-		return 0;
-
-	get_namespace(namespace);
-
-	if (!(flags & CLONE_NEWNS))
-		return 0;
-
-	if (!capable(CAP_SYS_ADMIN)) {
-		put_namespace(namespace);
-		return -EPERM;
-	}
-
 	new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL);
 	if (!new_ns)
 		goto out;
@@ -1396,8 +1386,6 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	}
 	up_write(&namespace_sem);
 
-	tsk->namespace = new_ns;
-
 	if (rootmnt)
 		mntput(rootmnt);
 	if (pwdmnt)
@@ -1405,12 +1393,40 @@ int copy_namespace(int flags, struct task_struct *tsk)
 	if (altrootmnt)
 		mntput(altrootmnt);
 
-	put_namespace(namespace);
-	return 0;
+out:
+	return new_ns;
+}
+
+int copy_namespace(int flags, struct task_struct *tsk)
+{
+	struct namespace *namespace = tsk->namespace;
+	struct namespace *new_ns;
+	int err = 0;
+
+	if (!namespace)
+		return 0;
+
+	get_namespace(namespace);
+
+	if (!(flags & CLONE_NEWNS))
+		return 0;
+
+	if (!capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	new_ns = dup_namespace(tsk, tsk->fs);
+	if (!new_ns) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	tsk->namespace = new_ns;
 
 out:
 	put_namespace(namespace);
-	return -ENOMEM;
+	return err;
 }
 
 asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name,
diff --git a/include/linux/namespace.h b/include/linux/namespace.h
index 6731977c4c1..3abc8e3b487 100644
--- a/include/linux/namespace.h
+++ b/include/linux/namespace.h
@@ -15,6 +15,7 @@ struct namespace {
 
 extern int copy_namespace(int, struct task_struct *);
 extern void __put_namespace(struct namespace *namespace);
+extern struct namespace *dup_namespace(struct task_struct *, struct fs_struct *);
 
 static inline void put_namespace(struct namespace *namespace)
 {
diff --git a/kernel/fork.c b/kernel/fork.c
index 598e5c27242..07dd241aa1e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1388,16 +1388,21 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }
 
 /*
- * Unsharing of namespace for tasks created without CLONE_NEWNS is not
- * supported yet
+ * Unshare the namespace structure if it is being shared
  */
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp)
+static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
 {
 	struct namespace *ns = current->namespace;
 
 	if ((unshare_flags & CLONE_NEWNS) &&
-	    (ns && atomic_read(&ns->count) > 1))
-		return -EINVAL;
+	    (ns && atomic_read(&ns->count) > 1)) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+		if (!*new_nsp)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -1482,7 +1487,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
 		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_namespace(unshare_flags, &new_ns)))
+	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
 		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
 		goto bad_unshare_cleanup_ns;
-- 
cgit v1.2.3-70-g09d2


From a0a7ec308f1be5957b20a1a535d21f683dfd83f0 Mon Sep 17 00:00:00 2001
From: JANAK DESAI <janak@us.ibm.com>
Date: Tue, 7 Feb 2006 12:59:01 -0800
Subject: [PATCH] unshare system call -v5: unshare vm

If vm structure is being shared, allocate a new one and copy information from
the current, shared, structure.

Signed-off-by: Janak Desai <janak@us.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 87 ++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 07dd241aa1e..d1aceaea3f3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -446,6 +446,55 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	}
 }
 
+/*
+ * Allocate a new mm structure and copy contents from the
+ * mm structure of the passed in task structure.
+ */
+static struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+	struct mm_struct *mm, *oldmm = current->mm;
+	int err;
+
+	if (!oldmm)
+		return NULL;
+
+	mm = allocate_mm();
+	if (!mm)
+		goto fail_nomem;
+
+	memcpy(mm, oldmm, sizeof(*mm));
+
+	if (!mm_init(mm))
+		goto fail_nomem;
+
+	if (init_new_context(tsk, mm))
+		goto fail_nocontext;
+
+	err = dup_mmap(mm, oldmm);
+	if (err)
+		goto free_pt;
+
+	mm->hiwater_rss = get_mm_rss(mm);
+	mm->hiwater_vm = mm->total_vm;
+
+	return mm;
+
+free_pt:
+	mmput(mm);
+
+fail_nomem:
+	return NULL;
+
+fail_nocontext:
+	/*
+	 * If init_new_context() failed, we cannot use mmput() to free the mm
+	 * because it calls destroy_context()
+	 */
+	mm_free_pgd(mm);
+	free_mm(mm);
+	return NULL;
+}
+
 static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct mm_struct * mm, *oldmm;
@@ -473,43 +522,17 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	}
 
 	retval = -ENOMEM;
-	mm = allocate_mm();
+	mm = dup_mm(tsk);
 	if (!mm)
 		goto fail_nomem;
 
-	/* Copy the current MM stuff.. */
-	memcpy(mm, oldmm, sizeof(*mm));
-	if (!mm_init(mm))
-		goto fail_nomem;
-
-	if (init_new_context(tsk,mm))
-		goto fail_nocontext;
-
-	retval = dup_mmap(mm, oldmm);
-	if (retval)
-		goto free_pt;
-
-	mm->hiwater_rss = get_mm_rss(mm);
-	mm->hiwater_vm = mm->total_vm;
-
 good_mm:
 	tsk->mm = mm;
 	tsk->active_mm = mm;
 	return 0;
 
-free_pt:
-	mmput(mm);
 fail_nomem:
 	return retval;
-
-fail_nocontext:
-	/*
-	 * If init_new_context() failed, we cannot use mmput() to free the mm
-	 * because it calls destroy_context()
-	 */
-	mm_free_pgd(mm);
-	free_mm(mm);
-	return retval;
 }
 
 static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
@@ -1423,18 +1446,20 @@ static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **
 }
 
 /*
- * Unsharing of vm for tasks created with CLONE_VM is not supported yet
+ * Unshare vm if it is being shared
  */
 static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
 {
 	struct mm_struct *mm = current->mm;
 
 	if ((unshare_flags & CLONE_VM) &&
-	    (mm && atomic_read(&mm->mm_users) > 1))
-		return -EINVAL;
+	    (mm && atomic_read(&mm->mm_users) > 1)) {
+		*new_mmp = dup_mm(current);
+		if (!*new_mmp)
+			return -ENOMEM;
+	}
 
 	return 0;
-
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From a016f3389c06606dd80e687942ff3c71d41823c4 Mon Sep 17 00:00:00 2001
From: JANAK DESAI <janak@us.ibm.com>
Date: Tue, 7 Feb 2006 12:59:02 -0800
Subject: [PATCH] unshare system call -v5: unshare files

If the file descriptor structure is being shared, allocate a new one and copy
information from the current, shared, structure.

Signed-off-by: Janak Desai <janak@us.ibm.com>
Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 81 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 51 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d1aceaea3f3..8e88b374cee 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -620,32 +620,17 @@ out:
 	return newf;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+/*
+ * Allocate a new files structure and copy contents from the
+ * passed in files structure.
+ */
+static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
-	struct files_struct *oldf, *newf;
+	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
-	int open_files, size, i, error = 0, expand;
+	int open_files, size, i, expand;
 	struct fdtable *old_fdt, *new_fdt;
 
-	/*
-	 * A background process may not have any files ...
-	 */
-	oldf = current->files;
-	if (!oldf)
-		goto out;
-
-	if (clone_flags & CLONE_FILES) {
-		atomic_inc(&oldf->count);
-		goto out;
-	}
-
-	/*
-	 * Note: we may be using current for both targets (See exec.c)
-	 * This works because we cache current->files (old) as oldf. Don't
-	 * break this.
-	 */
-	tsk->files = NULL;
-	error = -ENOMEM;
 	newf = alloc_files();
 	if (!newf)
 		goto out;
@@ -674,9 +659,9 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 	if (expand) {
 		spin_unlock(&oldf->file_lock);
 		spin_lock(&newf->file_lock);
-		error = expand_files(newf, open_files-1);
+		*errorp = expand_files(newf, open_files-1);
 		spin_unlock(&newf->file_lock);
-		if (error < 0)
+		if (*errorp < 0)
 			goto out_release;
 		new_fdt = files_fdtable(newf);
 		/*
@@ -725,10 +710,8 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
 		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}
 
-	tsk->files = newf;
-	error = 0;
 out:
-	return error;
+	return newf;
 
 out_release:
 	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
@@ -738,6 +721,40 @@ out_release:
 	goto out;
 }
 
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct files_struct *oldf, *newf;
+	int error = 0;
+
+	/*
+	 * A background process may not have any files ...
+	 */
+	oldf = current->files;
+	if (!oldf)
+		goto out;
+
+	if (clone_flags & CLONE_FILES) {
+		atomic_inc(&oldf->count);
+		goto out;
+	}
+
+	/*
+	 * Note: we may be using current for both targets (See exec.c)
+	 * This works because we cache current->files (old) as oldf. Don't
+	 * break this.
+	 */
+	tsk->files = NULL;
+	error = -ENOMEM;
+	newf = dup_fd(oldf, &error);
+	if (!newf)
+		goto out;
+
+	tsk->files = newf;
+	error = 0;
+out:
+	return error;
+}
+
 /*
  *	Helper to unshare the files of the current task.
  *	We don't want to expose copy_files internals to
@@ -1463,15 +1480,19 @@ static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
 }
 
 /*
- * Unsharing of files for tasks created with CLONE_FILES is not supported yet
+ * Unshare file descriptor table if it is being shared
  */
 static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
 {
 	struct files_struct *fd = current->files;
+	int error = 0;
 
 	if ((unshare_flags & CLONE_FILES) &&
-	    (fd && atomic_read(&fd->count) > 1))
-		return -EINVAL;
+	    (fd && atomic_read(&fd->count) > 1)) {
+		*new_fdp = dup_fd(fd, &error);
+		if (!*new_fdp)
+			return error;
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3-70-g09d2


From 1b8623545b42c03eb92e51b28c84acf4b8ba00a3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 15 Dec 2005 01:07:03 -0500
Subject: [PATCH] remove bogus asm/bug.h includes.

A bunch of asm/bug.h includes are both not needed (since it will get
pulled anyway) and bogus (since they are done too early).  Removed.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 crypto/scatterwalk.c                | 1 -
 drivers/cdrom/viocd.c               | 2 --
 drivers/net/hamradio/baycom_par.c   | 1 -
 drivers/tc/tc.c                     | 1 -
 drivers/video/backlight/backlight.c | 1 -
 drivers/video/backlight/lcd.c       | 1 -
 drivers/video/pmag-ba-fb.c          | 1 -
 drivers/video/pmagb-b-fb.c          | 1 -
 fs/reiserfs/hashes.c                | 1 -
 include/asm-mips/io.h               | 1 -
 include/asm-powerpc/dma-mapping.h   | 1 -
 include/linux/cpumask.h             | 1 -
 include/linux/dcache.h              | 1 -
 include/linux/jbd.h                 | 1 -
 include/linux/mtd/map.h             | 1 -
 include/linux/nodemask.h            | 1 -
 include/linux/smp.h                 | 1 -
 kernel/compat.c                     | 1 -
 net/dccp/ccids/lib/tfrc_equation.c  | 1 -
 net/ipv4/xfrm4_policy.c             | 1 -
 net/ipv6/raw.c                      | 1 -
 net/ipv6/xfrm6_policy.c             | 1 -
 net/xfrm/xfrm_policy.c              | 1 -
 23 files changed, 24 deletions(-)

(limited to 'kernel')

diff --git a/crypto/scatterwalk.c b/crypto/scatterwalk.c
index 47ac90e615f..2953e2cc56f 100644
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -17,7 +17,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
-#include <asm/bug.h>
 #include <asm/scatterlist.h>
 #include "internal.h"
 #include "scatterwalk.h"
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 193446e6a08..e2761725955 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -42,8 +42,6 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 
-#include <asm/bug.h>
-
 #include <asm/vio.h>
 #include <asm/scatterlist.h>
 #include <asm/iseries/hv_types.h>
diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c
index 3b1bef1ee21..77411a00d1e 100644
--- a/drivers/net/hamradio/baycom_par.c
+++ b/drivers/net/hamradio/baycom_par.c
@@ -86,7 +86,6 @@
 #include <linux/bitops.h>
 #include <linux/jiffies.h>
 
-#include <asm/bug.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/tc/tc.c b/drivers/tc/tc.c
index a0e5af638e0..4a51e56f85b 100644
--- a/drivers/tc/tc.c
+++ b/drivers/tc/tc.c
@@ -17,7 +17,6 @@
 #include <linux/types.h>
 
 #include <asm/addrspace.h>
-#include <asm/bug.h>
 #include <asm/errno.h>
 #include <asm/io.h>
 #include <asm/paccess.h>
diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c
index 9d5015e9937..bd39bbd88d4 100644
--- a/drivers/video/backlight/backlight.c
+++ b/drivers/video/backlight/backlight.c
@@ -13,7 +13,6 @@
 #include <linux/ctype.h>
 #include <linux/err.h>
 #include <linux/fb.h>
-#include <asm/bug.h>
 
 static ssize_t backlight_show_power(struct class_device *cdev, char *buf)
 {
diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c
index 68c690605aa..9e32485ee7b 100644
--- a/drivers/video/backlight/lcd.c
+++ b/drivers/video/backlight/lcd.c
@@ -13,7 +13,6 @@
 #include <linux/ctype.h>
 #include <linux/err.h>
 #include <linux/fb.h>
-#include <asm/bug.h>
 
 static ssize_t lcd_show_power(struct class_device *cdev, char *buf)
 {
diff --git a/drivers/video/pmag-ba-fb.c b/drivers/video/pmag-ba-fb.c
index f3927b6cda9..f5361cd8ccc 100644
--- a/drivers/video/pmag-ba-fb.c
+++ b/drivers/video/pmag-ba-fb.c
@@ -30,7 +30,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 
-#include <asm/bug.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
diff --git a/drivers/video/pmagb-b-fb.c b/drivers/video/pmagb-b-fb.c
index 25148de5fe6..eeeac924b50 100644
--- a/drivers/video/pmagb-b-fb.c
+++ b/drivers/video/pmagb-b-fb.c
@@ -27,7 +27,6 @@
 #include <linux/module.h>
 #include <linux/types.h>
 
-#include <asm/bug.h>
 #include <asm/io.h>
 #include <asm/system.h>
 
diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c
index a3ec238fd9e..e664ac16fad 100644
--- a/fs/reiserfs/hashes.c
+++ b/fs/reiserfs/hashes.c
@@ -21,7 +21,6 @@
 #include <linux/kernel.h>
 #include <linux/reiserfs_fs.h>
 #include <asm/types.h>
-#include <asm/bug.h>
 
 #define DELTA 0x9E3779B9
 #define FULLROUNDS 10		/* 32 is overkill, 16 is strong crypto */
diff --git a/include/asm-mips/io.h b/include/asm-mips/io.h
index d42685747e7..a9fa1254894 100644
--- a/include/asm-mips/io.h
+++ b/include/asm-mips/io.h
@@ -18,7 +18,6 @@
 #include <linux/types.h>
 
 #include <asm/addrspace.h>
-#include <asm/bug.h>
 #include <asm/byteorder.h>
 #include <asm/cpu.h>
 #include <asm/cpu-features.h>
diff --git a/include/asm-powerpc/dma-mapping.h b/include/asm-powerpc/dma-mapping.h
index 837756ab7dc..2ac63f56959 100644
--- a/include/asm-powerpc/dma-mapping.h
+++ b/include/asm-powerpc/dma-mapping.h
@@ -15,7 +15,6 @@
 #include <linux/mm.h>
 #include <asm/scatterlist.h>
 #include <asm/io.h>
-#include <asm/bug.h>
 
 #define DMA_ERROR_CODE		(~(dma_addr_t)0x0)
 
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 13e9f4a3ab2..20b446f26ec 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -84,7 +84,6 @@
 #include <linux/kernel.h>
 #include <linux/threads.h>
 #include <linux/bitmap.h>
-#include <asm/bug.h>
 
 typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;
 extern cpumask_t _unused_cpumask_arg_;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index a3f09947940..4361f378997 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -8,7 +8,6 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/rcupdate.h>
-#include <asm/bug.h>
 
 struct nameidata;
 struct vfsmount;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 751bb384946..0fe4aa891dd 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -239,7 +239,6 @@ typedef struct journal_superblock_s
 
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <asm/bug.h>
 
 #define JBD_ASSERTIONS
 #ifdef JBD_ASSERTIONS
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index fedfbc8a287..7dfd6e1fcde 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -15,7 +15,6 @@
 #include <asm/unaligned.h>
 #include <asm/system.h>
 #include <asm/io.h>
-#include <asm/bug.h>
 
 #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1
 #define map_bankwidth(map) 1
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 4726ef7ba8e..b959a4525cb 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -84,7 +84,6 @@
 #include <linux/threads.h>
 #include <linux/bitmap.h>
 #include <linux/numa.h>
-#include <asm/bug.h>
 
 typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;
 extern nodemask_t _unused_nodemask_arg_;
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 9dfa3ee769a..44153fdf73f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -17,7 +17,6 @@ extern void cpu_idle(void);
 #include <linux/compiler.h>
 #include <linux/thread_info.h>
 #include <asm/smp.h>
-#include <asm/bug.h>
 
 /*
  * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
diff --git a/kernel/compat.c b/kernel/compat.c
index 1867290c37e..8c9cd88b678 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,7 +23,6 @@
 #include <linux/security.h>
 
 #include <asm/uaccess.h>
-#include <asm/bug.h>
 
 int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
 {
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index d2b5933b451..add3cae65e2 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -15,7 +15,6 @@
 #include <linux/config.h>
 #include <linux/module.h>
 
-#include <asm/bug.h>
 #include <asm/div64.h>
 
 #include "tfrc.h"
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 42196ba3b0b..45f7ae58f2c 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -8,7 +8,6 @@
  * 	
  */
 
-#include <asm/bug.h>
 #include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/inetdevice.h>
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 66f1d12ea57..738376cf0c5 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -35,7 +35,6 @@
 #include <linux/skbuff.h>
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
-#include <asm/bug.h>
 
 #include <net/ip.h>
 #include <net/sock.h>
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 69bd957380e..91cce8b2d7a 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -11,7 +11,6 @@
  * 
  */
 
-#include <asm/bug.h>
 #include <linux/compiler.h>
 #include <linux/config.h>
 #include <linux/netdevice.h>
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 077bbf9fb9b..dbf4620768d 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -13,7 +13,6 @@
  *
  */
 
-#include <asm/bug.h>
 #include <linux/config.h>
 #include <linux/slab.h>
 #include <linux/kmod.h>
-- 
cgit v1.2.3-70-g09d2


From 53f087febfd12e74ba9f1082e71e9a45adc039ad Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 Feb 2006 05:56:41 -0500
Subject: [PATCH] timer.c NULL noise removal

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 4f1cb0ab525..b9dad399467 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -495,7 +495,7 @@ unsigned long next_timer_interrupt(void)
 	base = &__get_cpu_var(tvec_bases);
 	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
-	list = 0;
+	list = NULL;
 
 	/* Look for timer events in tv1. */
 	j = base->timer_jiffies & TVR_MASK;
-- 
cgit v1.2.3-70-g09d2


From 4bb8089c86b95b4f6bbd839cb83ca4556b06a031 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 1 Feb 2006 05:57:32 -0500
Subject: [PATCH] kernel/sys.c NULL noise removal

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 0929c698aff..f91218a5463 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -428,7 +428,7 @@ void kernel_kexec(void)
 {
 #ifdef CONFIG_KEXEC
 	struct kimage *image;
-	image = xchg(&kexec_image, 0);
+	image = xchg(&kexec_image, NULL);
 	if (!image) {
 		return;
 	}
-- 
cgit v1.2.3-70-g09d2


From c70d3d703ad94727dab2a3664aeee33d71e00715 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 9 Feb 2006 22:41:41 +0300
Subject: [PATCH] sys_signal: initialize ->sa_mask

Pointed out by Linus Torvalds.

sys_signal() forgets to initialize ->sa_mask.

( I suspect arch/ia64/ia32/ia32_signal.c:sys32_signal()
  also needs this fix )

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index b373fc2420d..01a1e7f7acf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2702,6 +2702,7 @@ sys_signal(int sig, __sighandler_t handler)
 
 	new_sa.sa.sa_handler = handler;
 	new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK;
+	sigemptyset(&new_sa.sa.sa_mask);
 
 	ret = do_sigaction(sig, &new_sa, &old_sa);
 
-- 
cgit v1.2.3-70-g09d2


From 9ac95f2f90e022c16d293d7978faddf7e779a1a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 9 Feb 2006 22:41:50 +0300
Subject: [PATCH] do_sigaction: cleanup ->sa_mask manipulation

Clear unblockable signals beforehand.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 2 +-
 kernel/signal.c       | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0cfcd1c7865..9c1da0269a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1098,7 +1098,7 @@ extern struct sigqueue *sigqueue_alloc(void);
 extern void sigqueue_free(struct sigqueue *);
 extern int send_sigqueue(int, struct sigqueue *,  struct task_struct *);
 extern int send_group_sigqueue(int, struct sigqueue *,  struct task_struct *);
-extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *);
+extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
 extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
 
 /* These can be the second arg to send_sig_info/send_group_sig_info.  */
diff --git a/kernel/signal.c b/kernel/signal.c
index 01a1e7f7acf..ea154104a00 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2430,7 +2430,7 @@ sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo)
 }
 
 int
-do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
+do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct k_sigaction *k;
 	sigset_t mask;
@@ -2454,6 +2454,8 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 		*oact = *k;
 
 	if (act) {
+		sigdelsetmask(&act->sa.sa_mask,
+			      sigmask(SIGKILL) | sigmask(SIGSTOP));
 		/*
 		 * POSIX 3.3.1.3:
 		 *  "Setting a signal action to SIG_IGN for a signal that is
@@ -2479,8 +2481,6 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 			read_lock(&tasklist_lock);
 			spin_lock_irq(&t->sighand->siglock);
 			*k = *act;
-			sigdelsetmask(&k->sa.sa_mask,
-				      sigmask(SIGKILL) | sigmask(SIGSTOP));
 			sigemptyset(&mask);
 			sigaddset(&mask, sig);
 			rm_from_queue_full(&mask, &t->signal->shared_pending);
@@ -2495,8 +2495,6 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact)
 		}
 
 		*k = *act;
-		sigdelsetmask(&k->sa.sa_mask,
-			      sigmask(SIGKILL) | sigmask(SIGSTOP));
 	}
 
 	spin_unlock_irq(&current->sighand->siglock);
-- 
cgit v1.2.3-70-g09d2


From a2000572ad511f5f43091ed7bd2cc3b913104a1e Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Fri, 10 Feb 2006 01:51:02 -0800
Subject: [PATCH] sched: remove smpnice

I don't think the code is quite ready, which is why I asked for Peter's
additions to also be merged before I acked it (although it turned out that
it still isn't quite ready with his additions either).

Basically I have had similar observations to Suresh in that it does not
play nicely with the rest of the balancing infrastructure (and raised
similar concerns in my review).

The samples (group of 4) I got for "maximum recorded imbalance" on a 2x2
SMP+HT Xeon are as follows:

            | Following boot | hackbench 20        | hackbench 40
 -----------+----------------+---------------------+---------------------
 2.6.16-rc2 | 30,37,100,112  | 5600,5530,6020,6090 | 6390,7090,8760,8470
 +nosmpnice |  3, 2,  4,  2  |   28, 150, 294, 132 |  348, 348, 294, 347

Hackbench raw performance is down around 15% with smpnice (but that in
itself isn't a huge deal because it is just a benchmark).  However, the
samples show that the imbalance passed into move_tasks is increased by
about a factor of 10-30.  I think this would also go some way to explaining
latency blips turning up in the balancing code (though I haven't actually
measured that).

We'll probably have to revert this in the SUSE kernel.

Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Peter Williams <pwil3058@bigpond.net.au>
Cc: "Martin J. Bligh" <mbligh@aracnet.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 129 ++++++++-------------------------------------------------
 1 file changed, 18 insertions(+), 111 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index bc38804e40d..87d93be336a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -215,7 +215,6 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
-	unsigned long prio_bias;
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -669,68 +668,13 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
-#ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias += MAX_PRIO - prio;
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-	rq->prio_bias -= MAX_PRIO - prio;
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			/*
-			 * The migration thread does the actual balancing. Do
-			 * not bias by its priority as the ultra high priority
-			 * will skew balancing adversely.
-			 */
-			inc_prio_bias(rq, p->prio);
-	} else
-		inc_prio_bias(rq, p->static_prio);
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-	if (rt_task(p)) {
-		if (p != rq->migration_thread)
-			dec_prio_bias(rq, p->prio);
-	} else
-		dec_prio_bias(rq, p->static_prio);
-}
-#else
-static inline void inc_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void dec_prio_bias(runqueue_t *rq, int prio)
-{
-}
-
-static inline void inc_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running++;
-}
-
-static inline void dec_nr_running(task_t *p, runqueue_t *rq)
-{
-	rq->nr_running--;
-}
-#endif
-
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 /*
@@ -739,7 +683,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
+	rq->nr_running++;
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -863,7 +807,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	dec_nr_running(p, rq);
+	rq->nr_running--;
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -1007,61 +951,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static unsigned long __source_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		source_load = load_now;
-	else
-		source_load = min(cpu_load, load_now);
-
-	if (running > 1 || (idle == NOT_IDLE && running))
-		/*
-		 * If we are busy rebalancing the load is biased by
-		 * priority to create 'nice' support across cpus. When
-		 * idle rebalancing we should only bias the source_load if
-		 * there is more than one task running on that queue to
-		 * prevent idle rebalance from trying to pull tasks from a
-		 * queue with only one running task.
-		 */
-		source_load = source_load * rq->prio_bias / running;
+		return load_now;
 
-	return source_load;
-}
-
-static inline unsigned long source_load(int cpu, int type)
-{
-	return __source_load(cpu, type, NOT_IDLE);
+	return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
+static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long running = rq->nr_running;
-	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
-		load_now = running * SCHED_LOAD_SCALE;
-
+	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
 	if (type == 0)
-		target_load = load_now;
-	else
-		target_load = max(cpu_load, load_now);
+		return load_now;
 
-	if (running > 1 || (idle == NOT_IDLE && running))
-		target_load = target_load * rq->prio_bias / running;
-
-	return target_load;
-}
-
-static inline unsigned long target_load(int cpu, int type)
-{
-	return __target_load(cpu, type, NOT_IDLE);
+	return max(rq->cpu_load[type-1], load_now);
 }
 
 /*
@@ -1530,7 +1440,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				inc_nr_running(p, rq);
+				rq->nr_running++;
 			}
 			set_need_resched();
 		} else
@@ -1875,9 +1785,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	dec_nr_running(p, src_rq);
+	src_rq->nr_running--;
 	set_task_cpu(p, this_cpu);
-	inc_nr_running(p, this_rq);
+	this_rq->nr_running++;
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -2056,9 +1966,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = __target_load(i, load_idx, idle);
+				load = target_load(i, load_idx);
 			else
-				load = __source_load(i, load_idx, idle);
+				load = source_load(i, load_idx);
 
 			avg_load += load;
 		}
@@ -2171,7 +2081,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = __source_load(i, 0, idle);
+		load = source_load(i, 0);
 
 		if (load > max_load) {
 			max_load = load;
@@ -3571,10 +3481,8 @@ void set_user_nice(task_t *p, long nice)
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array) {
+	if (array)
 		dequeue_task(p, array);
-		dec_prio_bias(rq, p->static_prio);
-	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
@@ -3584,7 +3492,6 @@ void set_user_nice(task_t *p, long nice)
 
 	if (array) {
 		enqueue_task(p, array);
-		inc_prio_bias(rq, p->static_prio);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
-- 
cgit v1.2.3-70-g09d2


From c22db9412736204b25aeba19d18e5ea922f7d632 Mon Sep 17 00:00:00 2001
From: Jan Beulich <jbeulich@novell.com>
Date: Fri, 10 Feb 2006 01:51:11 -0800
Subject: [PATCH] prevent recursive panic from softlockup watchdog

When panic_timeout is zero, suppress triggering a nested panic due to soft
lockup detection.

Signed-off-by: Jan Beulich <jbeulich@novell.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/panic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index c5c4ab25583..126dc43f1c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -130,6 +130,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 #endif
 	local_irq_enable();
 	for (i = 0;;) {
+		touch_softlockup_watchdog();
 		i += panic_blink(i);
 		mdelay(1);
 		i++;
-- 
cgit v1.2.3-70-g09d2


From 16bf134840da3920ded1290973c56ec214636f12 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Tue, 14 Feb 2006 13:52:59 -0800
Subject: [PATCH] compound page: no access_process_vm check

The PageCompound check before access_process_vm's set_page_dirty_lock is no
longer necessary, so remove it.  But leave the PageCompound checks in
bio_set_pages_dirty, dio_bio_complete and nfs_free_user_pages: at least some
of those were introduced as a little optimization on hugetlb pages.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5f33cdb6fff..d2cf144d0af 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -242,8 +242,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 		if (write) {
 			copy_to_user_page(vma, page, addr,
 					  maddr + offset, buf, bytes);
-			if (!PageCompound(page))
-				set_page_dirty_lock(page);
+			set_page_dirty_lock(page);
 		} else {
 			copy_from_user_page(vma, page, addr,
 					    buf, maddr + offset, bytes);
-- 
cgit v1.2.3-70-g09d2


From d6077cb80cde4506720f9165eba99ee07438513f Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Tue, 14 Feb 2006 13:53:10 -0800
Subject: [PATCH] sched: revert "filter affine wakeups"

Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6:

    [PATCH] sched: filter affine wakeups

Apparently caused more than 10% performance regression for aim7 benchmark.
The setup in use is 16-cpu HP rx8620, 64Gb of memory and 12 MSA1000s with 144
disks.  Each disk is 72Gb with a single ext3 filesystem (courtesy of HP, who
supplied benchmark results).

The problem is, for aim7, the wake-up pattern is random, but it still needs
load balancing action in the wake-up path to achieve best performance.  With
the above commit, lack of load balancing hurts that workload.

However, for workloads like database transaction processing, the requirement
is exactly opposite.  In the wake up path, best performance is achieved with
absolutely zero load balancing.  We simply wake up the process on the CPU that
it was previously run.  Worst performance is obtained when we do load
balancing at wake up.

There isn't an easy way to auto detect the workload characteristics.  Ingo's
earlier patch that detects idle CPU and decide whether to load balance or not
doesn't perform with aim7 either since all CPUs are busy (it causes even
bigger perf.  regression).

Revert commit d7102e95b7b9c00277562c29aad421d2d521c5f6, which causes more
than 10% performance regression with aim7.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |  5 +----
 kernel/sched.c        | 10 +---------
 2 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9c1da0269a1..b6f51e3a38e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -697,11 +697,8 @@ struct task_struct {
 
 	int lock_depth;		/* BKL lock depth */
 
-#if defined(CONFIG_SMP)
-	int last_waker_cpu;	/* CPU that last woke this task up */
-#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	int oncpu;
-#endif
 #endif
 	int prio, static_prio;
 	struct list_head run_list;
diff --git a/kernel/sched.c b/kernel/sched.c
index 87d93be336a..66d957227de 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1204,9 +1204,6 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
 		}
 	}
 
-	if (p->last_waker_cpu != this_cpu)
-		goto out_set_cpu;
-
 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;
 
@@ -1277,8 +1274,6 @@ out_set_cpu:
 		cpu = task_cpu(p);
 	}
 
-	p->last_waker_cpu = this_cpu;
-
 out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE) {
@@ -1360,12 +1355,9 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP)
-	p->last_waker_cpu = cpu;
-#if defined(__ARCH_WANT_UNLOCKED_CTXSW)
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
 #endif
-#endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
-- 
cgit v1.2.3-70-g09d2


From 06027bdd278a32a84b273e41db68a5db8ffd2bb6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 14 Feb 2006 13:53:15 -0800
Subject: [PATCH] hrtimer: round up relative start time on low-res arches

CONFIG_TIME_LOW_RES is a temporary way for architectures to signal that
they simply return xtime in do_gettimeoffset().  In this corner-case we
want to round up by resolution when starting a relative timer, to avoid
short timeouts.  This will go away with the GTOD framework.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Roman Zippel <zippel@linux-m68k.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/Kconfig       |  4 ++++
 arch/h8300/Kconfig     |  4 ++++
 arch/m68k/Kconfig      |  4 ++++
 arch/m68knommu/Kconfig |  4 ++++
 arch/parisc/Kconfig    |  5 +++++
 arch/v850/Kconfig      |  4 ++++
 kernel/hrtimer.c       | 13 ++++++++++++-
 7 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 60a617aff8b..e0838371237 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -25,6 +25,10 @@ config GENERIC_HARDIRQS
 	bool
 	default n
 
+config TIME_LOW_RES
+	bool
+	default y
+
 mainmenu "Fujitsu FR-V Kernel Configuration"
 
 source "init/Kconfig"
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 80940d712ac..98308b018a3 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -33,6 +33,10 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
+config TIME_LOW_RES
+	bool
+	default y
+
 config ISA
 	bool
 	default y
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 96b91982805..8849439e88d 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -21,6 +21,10 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
+config TIME_LOW_RES
+	bool
+	default y
+
 config ARCH_MAY_HAVE_PC_FDC
 	bool
 	depends on Q40 || (BROKEN && SUN3X)
diff --git a/arch/m68knommu/Kconfig b/arch/m68knommu/Kconfig
index e2a6e864896..e50858dbc23 100644
--- a/arch/m68knommu/Kconfig
+++ b/arch/m68knommu/Kconfig
@@ -29,6 +29,10 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
+config TIME_LOW_RES
+	bool
+	default y
+
 source "init/Kconfig"
 
 menu "Processor type and features"
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 7c914a4c67c..eca33cfa8a4 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -29,6 +29,11 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
 
+config TIME_LOW_RES
+	bool
+	depends on SMP
+	default y
+
 config GENERIC_ISA_DMA
 	bool
 
diff --git a/arch/v850/Kconfig b/arch/v850/Kconfig
index 04494638b96..e7fc3e50034 100644
--- a/arch/v850/Kconfig
+++ b/arch/v850/Kconfig
@@ -28,6 +28,10 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config TIME_LOW_RES
+	bool
+	default y
+
 # Turn off some random 386 crap that can affect device config
 config ISA
 	bool
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2b6e1757aed..5ae51f1bc7c 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -418,8 +418,19 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 	/* Switch the timer base, if necessary: */
 	new_base = switch_hrtimer_base(timer, base);
 
-	if (mode == HRTIMER_REL)
+	if (mode == HRTIMER_REL) {
 		tim = ktime_add(tim, new_base->get_time());
+		/*
+		 * CONFIG_TIME_LOW_RES is a temporary way for architectures
+		 * to signal that they simply return xtime in
+		 * do_gettimeoffset(). In this case we want to round up by
+		 * resolution when starting a relative timer, to avoid short
+		 * timeouts. This will go away with the GTOD framework.
+		 */
+#ifdef CONFIG_TIME_LOW_RES
+		tim = ktime_add(tim, base->resolution);
+#endif
+	}
 	timer->expires = tim;
 
 	enqueue_hrtimer(timer, new_base);
-- 
cgit v1.2.3-70-g09d2


From 3f17da699431ec48540beabc55c54d4b5e66c8e7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 15 Feb 2006 22:13:24 +0300
Subject: [PATCH] fix kill_proc_info() vs CLONE_THREAD race

There is a window after copy_process() unlocks ->sighand.siglock
and before it adds the new thread to the thread list.

In that window __group_complete_signal(SIGKILL) will not see the
new thread yet, so this thread will start running while the whole
thread group was supposed to exit.

I beleive we have another good reason to place attach_pid(PID/TGID)
under ->sighand.siglock. We can do the same for

	release_task()->__unhash_process()

	de_thread()->switch_exec_pids()

After that we don't need tasklist_lock to iterate over the thread
list, and we can simplify things, see for example do_sigaction()
or sys_times().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 8e88b374cee..3683ce10f4a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1123,8 +1123,8 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->real_parent = current;
 	p->parent = p->real_parent;
 
+	spin_lock(&current->sighand->siglock);
 	if (clone_flags & CLONE_THREAD) {
-		spin_lock(&current->sighand->siglock);
 		/*
 		 * Important: if an exit-all has been started then
 		 * do not create this new thread - the whole thread
@@ -1162,8 +1162,6 @@ static task_t *copy_process(unsigned long clone_flags,
 			 */
 			p->it_prof_expires = jiffies_to_cputime(1);
 		}
-
-		spin_unlock(&current->sighand->siglock);
 	}
 
 	/*
@@ -1189,6 +1187,7 @@ static task_t *copy_process(unsigned long clone_flags,
 
 	nr_threads++;
 	total_forks++;
+	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
 	return p;
-- 
cgit v1.2.3-70-g09d2


From dadac81b1b86196fcc48fb87620403c4a7174f06 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 15 Feb 2006 22:13:26 +0300
Subject: [PATCH] fix kill_proc_info() vs fork() theoretical race

copy_process:

	attach_pid(p, PIDTYPE_PID, p->pid);
	attach_pid(p, PIDTYPE_TGID, p->tgid);

What if kill_proc_info(p->pid) happens in between?

copy_process() holds current->sighand.siglock, so we are safe
in CLONE_THREAD case, because current->sighand == p->sighand.

Otherwise, p->sighand is unlocked, the new process is already
visible to the find_task_by_pid(), but have a copy of parent's
'struct pid' in ->pids[PIDTYPE_TGID].

This means that __group_complete_signal() may hang while doing

	do ... while (next_thread() != p)

We can solve this problem if we reverse these 2 attach_pid()s:

	attach_pid() does wmb()

	group_send_sig_info() calls spin_lock(), which
	provides a read barrier. // Yes ?

I don't think we can hit this race in practice, but still.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 3683ce10f4a..fbea12d7a94 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1173,8 +1173,6 @@ static task_t *copy_process(unsigned long clone_flags,
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
-	attach_pid(p, PIDTYPE_PID, p->pid);
-	attach_pid(p, PIDTYPE_TGID, p->tgid);
 	if (thread_group_leader(p)) {
 		p->signal->tty = current->signal->tty;
 		p->signal->pgrp = process_group(current);
@@ -1184,6 +1182,8 @@ static task_t *copy_process(unsigned long clone_flags,
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	}
+	attach_pid(p, PIDTYPE_TGID, p->tgid);
+	attach_pid(p, PIDTYPE_PID, p->pid);
 
 	nr_threads++;
 	total_forks++;
-- 
cgit v1.2.3-70-g09d2


From 5ecfbae093f0c37311e89b29bfc0c9d586eace87 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 15 Feb 2006 22:50:10 +0300
Subject: [PATCH] fix zap_thread's ptrace related problems

1. The tracee can go from ptrace_stop() to do_signal_stop()
   after __ptrace_unlink(p).

2. It is unsafe to __ptrace_unlink(p) while p->parent may wait
   for tasklist_lock in ptrace_detach().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c              |  2 +-
 include/linux/ptrace.h |  1 +
 kernel/ptrace.c        | 25 +++++++++++++++----------
 3 files changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 055378d2513..0e1c95074d4 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1403,7 +1403,7 @@ static void zap_threads (struct mm_struct *mm)
 		do_each_thread(g,p) {
 			if (mm == p->mm && p != tsk &&
 			    p->ptrace && p->parent->mm == mm) {
-				__ptrace_unlink(p);
+				__ptrace_detach(p, 0);
 			}
 		} while_each_thread(g,p);
 		write_unlock_irq(&tasklist_lock);
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 9d5cd106b34..0d36750fc0f 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -84,6 +84,7 @@ extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __us
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern int ptrace_attach(struct task_struct *tsk);
 extern int ptrace_detach(struct task_struct *, unsigned int);
+extern void __ptrace_detach(struct task_struct *, unsigned int);
 extern void ptrace_disable(struct task_struct *);
 extern int ptrace_check_attach(struct task_struct *task, int kill);
 extern int ptrace_request(struct task_struct *child, long request, long addr, long data);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index d2cf144d0af..d95a72c9279 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -72,8 +72,8 @@ void ptrace_untrace(task_t *child)
  */
 void __ptrace_unlink(task_t *child)
 {
-	if (!child->ptrace)
-		BUG();
+	BUG_ON(!child->ptrace);
+
 	child->ptrace = 0;
 	if (!list_empty(&child->ptrace_list)) {
 		list_del_init(&child->ptrace_list);
@@ -184,22 +184,27 @@ bad:
 	return retval;
 }
 
+void __ptrace_detach(struct task_struct *child, unsigned int data)
+{
+	child->exit_code = data;
+	/* .. re-parent .. */
+	__ptrace_unlink(child);
+	/* .. and wake it up. */
+	if (child->exit_state != EXIT_ZOMBIE)
+		wake_up_process(child);
+}
+
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	if (!valid_signal(data))
-		return	-EIO;
+		return -EIO;
 
 	/* Architecture-specific hardware disable .. */
 	ptrace_disable(child);
 
-	/* .. re-parent .. */
-	child->exit_code = data;
-
 	write_lock_irq(&tasklist_lock);
-	__ptrace_unlink(child);
-	/* .. and wake it up. */
-	if (child->exit_state != EXIT_ZOMBIE)
-		wake_up_process(child);
+	if (child->ptrace)
+		__ptrace_detach(child, data);
 	write_unlock_irq(&tasklist_lock);
 
 	return 0;
-- 
cgit v1.2.3-70-g09d2


From 06fed33849c13af637c4d09e9ba27828fac9edd5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 15 Feb 2006 15:17:38 -0800
Subject: [PATCH] cpuset: oops in exit on null cpuset fix

Fix a latent bug in cpuset_exit() handling.  If a task tried to allocate
memory after calling cpuset_exit(), it oops'd in
cpuset_update_task_memory_state() on a NULL cpuset pointer.

So set the exiting tasks cpuset to the root cpuset instead of to NULL.

A distro kernel hit this with an added kernel package that had just such a
hook (allocating memory) in the exit code path.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba42b0a7696..12815d3f1a0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1977,6 +1977,39 @@ void cpuset_fork(struct task_struct *child)
  * We don't need to task_lock() this reference to tsk->cpuset,
  * because tsk is already marked PF_EXITING, so attach_task() won't
  * mess with it, or task is a failed fork, never visible to attach_task.
+ *
+ * Hack:
+ *
+ *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
+ *
+ *    Don't leave a task unable to allocate memory, as that is an
+ *    accident waiting to happen should someone add a callout in
+ *    do_exit() after the cpuset_exit() call that might allocate.
+ *    If a task tries to allocate memory with an invalid cpuset,
+ *    it will oops in cpuset_update_task_memory_state().
+ *
+ *    We call cpuset_exit() while the task is still competent to
+ *    handle notify_on_release(), then leave the task attached to
+ *    the root cpuset (top_cpuset) for the remainder of its exit.
+ *
+ *    To do this properly, we would increment the reference count on
+ *    top_cpuset, and near the very end of the kernel/exit.c do_exit()
+ *    code we would add a second cpuset function call, to drop that
+ *    reference.  This would just create an unnecessary hot spot on
+ *    the top_cpuset reference count, to no avail.
+ *
+ *    Normally, holding a reference to a cpuset without bumping its
+ *    count is unsafe.   The cpuset could go away, or someone could
+ *    attach us to a different cpuset, decrementing the count on
+ *    the first cpuset that we never incremented.  But in this case,
+ *    top_cpuset isn't going away, and either task has PF_EXITING set,
+ *    which wards off any attach_task() attempts, or task is a failed
+ *    fork, never visible to attach_task.
+ *
+ *    Another way to do this would be to set the cpuset pointer
+ *    to NULL here, and check in cpuset_update_task_memory_state()
+ *    for a NULL pointer.  This hack avoids that NULL check, for no
+ *    cost (other than this way too long comment ;).
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1984,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = NULL;
+	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
-- 
cgit v1.2.3-70-g09d2


From c8adb494a6df6b2be8e50a8dafd5bab231df3505 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 15 Feb 2006 15:17:42 -0800
Subject: [PATCH] swsusp: nuke noisy message

I get about 88 squillion of these when suspending an old ad450nx server.

Cc: Pavel Roskin <proski@gnu.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 41f66365f0d..8d5a5986d62 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -91,10 +91,8 @@ static int save_highmem_zone(struct zone *zone)
 		 * corrected eventually when the cases giving rise to this
 		 * are better understood.
 		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
+		if (PageReserved(page))
 			continue;
-		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
-- 
cgit v1.2.3-70-g09d2


From a62eaf151d9cb478d127cfbc2e93c498869785b0 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Thu, 16 Feb 2006 23:41:58 +0100
Subject: [PATCH] x86_64: Add boot option to disable randomized mappings and
 cleanup

AMD SimNow!'s JIT doesn't like them at all in the guest. For distribution
installation it's easiest if it's a boot time option.

Also I moved the variable to a more appropiate place and make
it independent from sysctl

And marked __read_mostly which it is.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt |  3 +++
 arch/i386/kernel/cpu/transmeta.c    |  1 +
 include/linux/kernel.h              |  6 ------
 include/linux/mm.h                  |  2 ++
 kernel/sysctl.c                     |  2 --
 mm/memory.c                         | 10 ++++++++++
 6 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index ac75b57edf2..b874771385c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1638,6 +1638,9 @@ running once the system is up.
 			Format:
 			<irq>,<irq_mask>,<io>,<full_duplex>,<do_sound>,<lockup_hack>[,<irq2>[,<irq3>[,<irq4>]]]
 
+	norandmaps	Don't use address space randomization
+			Equivalent to echo 0 > /proc/sys/kernel/randomize_va_space
+
 
 ______________________________________________________________________
 Changelog:
diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c
index bdbeb77f4e2..7214c9b577a 100644
--- a/arch/i386/kernel/cpu/transmeta.c
+++ b/arch/i386/kernel/cpu/transmeta.c
@@ -1,4 +1,5 @@
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/init.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index b49affa0ac5..3b507bf05d0 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -326,12 +326,6 @@ struct sysinfo {
 /* Force a compilation error if condition is true */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
 
-#ifdef CONFIG_SYSCTL
-extern int randomize_va_space;
-#else
-#define randomize_va_space 1
-#endif
-
 /* Trap pasters of __FUNCTION__ at compile-time */
 #define __FUNCTION__ (__func__)
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 75e9f072499..26e1663a5cb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1051,5 +1051,7 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 void drop_pagecache(void);
 void drop_slab(void);
 
+extern int randomize_va_space;
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 71dd6f62efe..7654d55c47f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -126,8 +126,6 @@ extern int sysctl_hz_timer;
 extern int acct_parm[];
 #endif
 
-int randomize_va_space = 1;
-
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8..9abc6008544 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
 
+int randomize_va_space __read_mostly = 1;
+
+static int __init disable_randmaps(char *s)
+{
+	randomize_va_space = 0;
+	return 0;
+}
+__setup("norandmaps", disable_randmaps);
+
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none.  Usually (but
-- 
cgit v1.2.3-70-g09d2


From 726c14bf499e91e7ede4f1728830aba05c675061 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 17 Feb 2006 10:30:23 +1100
Subject: [PATCH] Provide an interface for getting the current tick length

This provides an interface for arch code to find out how many
nanoseconds are going to be added on to xtime by the next call to
do_timer.  The value returned is a fixed-point number in 52.12 format
in nanoseconds.  The reason for this format is that it gives the
full precision that the timekeeping code is using internally.

The motivation for this is to fix a problem that has arisen on 32-bit
powerpc in that the value returned by do_gettimeofday drifts apart
from xtime if NTP is being used.  PowerPC is now using a lockless
do_gettimeofday based on reading the timebase register and performing
some simple arithmetic.  (This method of getting the time is also
exported to userspace via the VDSO.)  However, the factor and offset
it uses were calculated based on the nominal tick length and weren't
being adjusted when NTP varied the tick length.

Note that 64-bit powerpc has had the lockless do_gettimeofday for a
long time now.  It also had an extremely hairy routine that got called
from the 32-bit compat routine for adjtimex, which adjusted the
factor and offset according to what it thought the timekeeping code
was going to do.  Not only was this only called if a 32-bit task did
adjtimex (i.e. not if a 64-bit task did adjtimex), it was also
duplicating computations from kernel/timer.c and it wasn't clear that
it was (still) correct.

The simple solution is to ask the timekeeping code how long the
current jiffy will be on each timer interrupt, after calling
do_timer.  If this jiffy will be a different length from the last one,
we then need to compute new values for the factor and offset used in
the lockless do_gettimeofday.  In this way we can keep xtime and
do_gettimeofday in sync, even when NTP is varying the tick length.

Note that when adjtimex varies the tick length, it almost always
introduces the variation from the next tick on.  The only case I could
see where adjtimex would vary the length of the current tick is when
an old-style adjtime adjustment is being cancelled.  (It's not clear
to me why the adjustment has to be cancelled immediately rather than
from the next tick on.)  Thus I don't see any real need for a hook in
adjtimex; the rare case of an old-style adjustment being cancelled can
be fixed up at the next tick.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: john stultz <johnstul@us.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  3 +++
 kernel/timer.c        | 39 ++++++++++++++++++++++++++++++++++-----
 2 files changed, 37 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 04a4a8cb4ed..b7ca1204e42 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -345,6 +345,9 @@ time_interpolator_reset(void)
 
 #endif /* !CONFIG_TIME_INTERPOLATION */
 
+/* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). */
+extern u64 current_tick_length(void);
+
 #endif /* KERNEL */
 
 #endif /* LINUX_TIMEX_H */
diff --git a/kernel/timer.c b/kernel/timer.c
index b9dad399467..fe3a9a9f832 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -717,12 +717,16 @@ static void second_overflow(void)
 #endif
 }
 
-/* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
+/*
+ * Returns how many microseconds we need to add to xtime this tick
+ * in doing an adjustment requested with adjtime.
+ */
+static long adjtime_adjustment(void)
 {
-	long time_adjust_step, delta_nsec;
+	long time_adjust_step;
 
-	if ((time_adjust_step = time_adjust) != 0 ) {
+	time_adjust_step = time_adjust;
+	if (time_adjust_step) {
 		/*
 		 * We are doing an adjtime thing.  Prepare time_adjust_step to
 		 * be within bounds.  Note that a positive time_adjust means we
@@ -733,10 +737,19 @@ static void update_wall_time_one_tick(void)
 		 */
 		time_adjust_step = min(time_adjust_step, (long)tickadj);
 		time_adjust_step = max(time_adjust_step, (long)-tickadj);
+	}
+	return time_adjust_step;
+}
 
+/* in the NTP reference this is called "hardclock()" */
+static void update_wall_time_one_tick(void)
+{
+	long time_adjust_step, delta_nsec;
+
+	time_adjust_step = adjtime_adjustment();
+	if (time_adjust_step)
 		/* Reduce by this step the amount of time left  */
 		time_adjust -= time_adjust_step;
-	}
 	delta_nsec = tick_nsec + time_adjust_step * 1000;
 	/*
 	 * Advance the phase, once it gets to one microsecond, then
@@ -758,6 +771,22 @@ static void update_wall_time_one_tick(void)
 	}
 }
 
+/*
+ * Return how long ticks are at the moment, that is, how much time
+ * update_wall_time_one_tick will add to xtime next time we call it
+ * (assuming no calls to do_adjtimex in the meantime).
+ * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10
+ * bits to the right of the binary point.
+ * This function has no side-effects.
+ */
+u64 current_tick_length(void)
+{
+	long delta_nsec;
+
+	delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
+	return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj;
+}
+
 /*
  * Using a loop looks inefficient, but "ticks" is
  * usually just one (we shouldn't be losing ticks,
-- 
cgit v1.2.3-70-g09d2


From 4bbf39c29bc3409d6454faf0dfa1b3b0aa2ac2af Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Feb 2006 13:52:44 -0800
Subject: [PATCH] Introduce CONFIG_DEFAULT_MIGRATION_COST

Heiko Carstens <heiko.carstens@de.ibm.com> wrote:

  The boot sequence on s390 sometimes takes ages and we spend a very long
  time (up to one or two minutes) in calibrate_migration_costs.  The time
  spent there differs from boot to boot.  Also the calculated costs differ
  a lot.  I've seen differences by up to a factor of 15 (yes, factor not
  percent).  Also I doubt that making these measurements make much sense on
  a completely virtualized architecture where you cannot tell how much cpu
  time you will get anyway.

So introduce the CONFIG_DEFAULT_MIGRATION_COST method for an architecture
to set the scheduler migration costs.  This turns off automatic detection
of migration costs.  Makes sense on virtual platforms, where migration
costs are hard to measure accurately.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/Kconfig |  4 ++++
 kernel/sched.c    | 13 ++++++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index b66602ad7b3..b7ca5bf9acf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -80,6 +80,10 @@ config HOTPLUG_CPU
 	  can be controlled through /sys/devices/system/cpu/cpu#.
 	  Say N if you want to disable CPU hotplug.
 
+config DEFAULT_MIGRATION_COST
+	int
+	default "1000000"
+
 config MATHEMU
 	bool "IEEE FPU emulation"
 	depends on MARCH_G5
diff --git a/kernel/sched.c b/kernel/sched.c
index 66d957227de..12d291bf337 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5058,7 +5058,18 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 #define MAX_DOMAIN_DISTANCE 32
 
 static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] =
-		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -1LL };
+		{ [ 0 ... MAX_DOMAIN_DISTANCE-1 ] =
+/*
+ * Architectures may override the migration cost and thus avoid
+ * boot-time calibration. Unit is nanoseconds. Mostly useful for
+ * virtualized hardware:
+ */
+#ifdef CONFIG_DEFAULT_MIGRATION_COST
+			CONFIG_DEFAULT_MIGRATION_COST
+#else
+			-1LL
+#endif
+};
 
 /*
  * Allow override of migration cost - in units of microseconds.
-- 
cgit v1.2.3-70-g09d2


From a8534adb74e23374889b84b3d97eb18da542a1b5 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Fri, 17 Feb 2006 13:52:51 -0800
Subject: [PATCH] swsusp: fix breakage with swap on LVM

Restore the compatibility with the older code and make it possible to
suspend if the kernel command line doesn't contain the "resume=" argument

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e8..2d9d08f72f7 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -153,13 +153,11 @@ static int swsusp_swap_check(void) /* This is called before saving image */
 {
 	int i;
 
-	if (!swsusp_resume_device)
-		return -ENODEV;
 	spin_lock(&swap_lock);
 	for (i = 0; i < MAX_SWAPFILES; i++) {
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
-		if (is_resume_device(swap_info + i)) {
+		if (!swsusp_resume_device || is_resume_device(swap_info + i)) {
 			spin_unlock(&swap_lock);
 			root_swap = i;
 			return 0;
-- 
cgit v1.2.3-70-g09d2


From ef20c8c197df9b8d5bd4af0679123826da028861 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Feb 2006 15:41:50 -0500
Subject: [PATCH] GFP_KERNEL allocations in atomic (auditsc)

audit_log_exit() is called from atomic contexts and gets explicit
gfp_mask argument; it should use it for all allocations rather
than doing some with gfp_mask and some with GFP_KERNEL.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 685c25175d9..d7e7e637b92 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -841,7 +841,7 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 
 	for (aux = context->aux; aux; aux = aux->next) {
 
-		ab = audit_log_start(context, GFP_KERNEL, aux->type);
+		ab = audit_log_start(context, gfp_mask, aux->type);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
@@ -878,14 +878,14 @@ static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 	}
 
 	if (context->pwd && context->pwdmnt) {
-		ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
+		ab = audit_log_start(context, gfp_mask, AUDIT_CWD);
 		if (ab) {
 			audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
 			audit_log_end(ab);
 		}
 	}
 	for (i = 0; i < context->name_count; i++) {
-		ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+		ab = audit_log_start(context, gfp_mask, AUDIT_PATH);
 		if (!ab)
 			continue; /* audit_panic has been called */
 
-- 
cgit v1.2.3-70-g09d2


From c255d844dd73616f23e4b4733edcc2e5fa4042b2 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Mon, 20 Feb 2006 18:27:58 -0800
Subject: [PATCH] suspend-to-ram: allow video options to be set at runtime

Currently, acpi video options can only be set on kernel command line.  That's
little inflexible; I'd like userland s2ram application that just works, and
modifying kernel command line according to whitelist is not fun.  It is better
to just allow s2ram application to set video options just before suspend
(according to the whitelist).

This implements sysctl to allow setting suspend video options without reboot.

(akpm: Documentation updates for this new sysctl are pending..)

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: "Brown, Len" <len.brown@intel.com>
Cc: "Antonino A. Daplas" <adaplas@pol.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sysctl/kernel.txt | 10 ++++++++++
 include/linux/acpi.h            |  3 ++-
 include/linux/sysctl.h          |  1 +
 kernel/sysctl.c                 | 16 ++++++++++++----
 4 files changed, 25 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index 9f11d36a8c1..b0c7ab93dcb 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -16,6 +16,7 @@ before actually making adjustments.
 
 Currently, these files might (depending on your configuration)
 show up in /proc/sys/kernel:
+- acpi_video_flags
 - acct
 - core_pattern
 - core_uses_pid
@@ -57,6 +58,15 @@ show up in /proc/sys/kernel:
 
 ==============================================================
 
+acpi_video_flags:
+
+flags
+
+See Doc*/kernel/power/video.txt, it allows mode of video boot to be
+set during run time.
+
+==============================================================
+
 acct:
 
 highwater lowwater frequency
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 84d3d9f034c..d3bc25e6d27 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -427,7 +427,8 @@ extern int acpi_mp_config;
 extern struct acpi_table_mcfg_config *pci_mmcfg_config;
 extern int pci_mmcfg_config_num;
 
-extern int sbf_port ;
+extern int sbf_port;
+extern unsigned long acpi_video_flags;
 
 #else	/* !CONFIG_ACPI */
 
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 32a4139c4ad..0e92bf7ec28 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -146,6 +146,7 @@ enum
 	KERN_RANDOMIZE=68, /* int: randomize virtual address space */
 	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
+	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7654d55c47f..ebc41bf22f1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -44,14 +44,12 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/nfs_fs.h>
+#include <linux/acpi.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 
-#ifdef CONFIG_ROOT_NFS
-#include <linux/nfs_fs.h>
-#endif
-
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -655,6 +653,16 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
+#ifdef CONFIG_ACPI_SLEEP
+	{
+		.ctl_name	= KERN_ACPI_VIDEO_FLAGS,
+		.procname	= "acpi_video_flags",
+		.data		= &acpi_video_flags,
+		.maxlen		= sizeof (unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3-70-g09d2


From 7a9166e3b037296366cea6f3c97f705d33e209e6 Mon Sep 17 00:00:00 2001
From: Luke Yang <luke.adi@gmail.com>
Date: Mon, 20 Feb 2006 18:28:07 -0800
Subject: [PATCH] Fix undefined symbols for nommu architecture

Signed-off-by: Luke Yang <luke.adi@gmail.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 4 ++++
 kernel/sysctl.c    | 2 ++
 mm/nommu.c         | 2 ++
 3 files changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 26e1663a5cb..498ff8778fb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1051,7 +1051,11 @@ int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 void drop_pagecache(void);
 void drop_slab(void);
 
+#ifndef CONFIG_MMU
+#define randomize_va_space 0
+#else
 extern int randomize_va_space;
+#endif
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ebc41bf22f1..c05a2b7125e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -636,6 +636,7 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+#if defined(CONFIG_MMU)
 	{
 		.ctl_name	= KERN_RANDOMIZE,
 		.procname	= "randomize_va_space",
@@ -644,6 +645,7 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.ctl_name	= KERN_SPIN_RETRY,
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d6823..99d21020ec9 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -57,6 +57,8 @@ EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
 
 /*
  * Handle all mappings that got truncated by a "truncate()"
-- 
cgit v1.2.3-70-g09d2


From 7fd105e758c8d746d57ab7e77f100e096bf153c8 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Mon, 20 Feb 2006 18:28:08 -0800
Subject: [PATCH] Fix compile for CONFIG_SYSVIPC=n or CONFIG_SYSCTL=n

The compat syscalls are added to sys_ni.c since they are not defined if the
above CONFIG options are off.  Also, nfs would not build with CONFIG_SYSCTL
off.

Noticed by Arthur Othieno.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/nfs_fs.h | 2 +-
 kernel/sys_ni.c        | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 547d649b274..b4dc6e2e10c 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -398,7 +398,7 @@ extern struct inode_operations nfs_symlink_inode_operations;
 extern int nfs_register_sysctl(void);
 extern void nfs_unregister_sysctl(void);
 #else
-#define nfs_register_sysctl() do { } while(0)
+#define nfs_register_sysctl() 0
 #define nfs_unregister_sysctl() do { } while(0)
 #endif
 
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 17313b99e53..1067090db6b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -104,6 +104,8 @@ cond_syscall(sys_setreuid16);
 cond_syscall(sys_setuid16);
 cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
+cond_syscall(compat_sys_ipc);
+cond_syscall(compat_sys_sysctl);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
-- 
cgit v1.2.3-70-g09d2


From 5914811acf36c3ff091f860a6964808f668f27d0 Mon Sep 17 00:00:00 2001
From: Bj�rn Steinbrink <B.Steinbrink@gmx.de>
Date: Sat, 18 Feb 2006 18:12:43 +0100
Subject: [PATCH] kjournald keeps reference to namespace
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In daemonize() a new thread gets cleaned up and 'merged' with init_task.
The current fs_struct is handled there, but not the current namespace.

This adds the namespace part.

[ Eric Biederman pointed out the namespace wrappers, and also notes that
  we can't ever count on using our parents namespace because we already
  have called exit_fs(), which is the only way to the namespace from a
  process. ]

Signed-off-by: Björn Steinbrink <B.Steinbrink@gmx.de>
Acked-by: Eric Biederman <ebiederm@xmission.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 93cee367133..531aadca553 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -360,6 +360,9 @@ void daemonize(const char *name, ...)
 	fs = init_task.fs;
 	current->fs = fs;
 	atomic_inc(&fs->count);
+	exit_namespace(current);
+	current->namespace = init_task.namespace;
+	get_namespace(current->namespace);
  	exit_files(current);
 	current->files = init_task.files;
 	atomic_inc(&current->files->count);
-- 
cgit v1.2.3-70-g09d2


From d2b176ed878d4d5fcc0bd35656dfd373f3702af9 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Tue, 28 Feb 2006 09:42:23 -0800
Subject: [IA64] sysctl option to silence unaligned trap warnings

Allow sysadmin to disable all warnings about userland apps
making unaligned accesses by using:
 # echo 1 > /proc/sys/kernel/ignore-unaligned-usertrap
Rather than having to use prctl on a process by process basis.

Default behaivour leaves the warnings enabled.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/ia64/kernel/unaligned.c | 31 ++++++++++++++++++++++++++++---
 include/linux/sysctl.h       |  1 +
 kernel/sysctl.c              | 14 ++++++++++++++
 3 files changed, 43 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 11291389684..1e357550c77 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -52,6 +52,15 @@ dump (const char *str, void *vp, size_t len)
 #define IA64_FIRST_ROTATING_FR	32
 #define SIGN_EXT9		0xffffffffffffff00ul
 
+/*
+ *  sysctl settable hook which tells the kernel whether to honor the
+ *  IA64_THREAD_UAC_NOPRINT prctl.  Because this is user settable, we want
+ *  to allow the super user to enable/disable this for security reasons
+ *  (i.e. don't allow attacker to fill up logs with unaligned accesses).
+ */
+int no_unaligned_warning;
+static int noprint_warning;
+
 /*
  * For M-unit:
  *
@@ -1324,8 +1333,9 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 		if ((current->thread.flags & IA64_THREAD_UAC_SIGBUS) != 0)
 			goto force_sigbus;
 
-		if (!(current->thread.flags & IA64_THREAD_UAC_NOPRINT)
-		    && within_logging_rate_limit())
+		if (!no_unaligned_warning &&
+		    !(current->thread.flags & IA64_THREAD_UAC_NOPRINT) &&
+		    within_logging_rate_limit())
 		{
 			char buf[200];	/* comm[] is at most 16 bytes... */
 			size_t len;
@@ -1340,7 +1350,22 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			if (user_mode(regs))
 				tty_write_message(current->signal->tty, buf);
 			buf[len-1] = '\0';	/* drop '\r' */
-			printk(KERN_WARNING "%s", buf);	/* watch for command names containing %s */
+			/* watch for command names containing %s */
+			printk(KERN_WARNING "%s", buf);
+		} else {
+			if (no_unaligned_warning && !noprint_warning) {
+				noprint_warning = 1;
+				printk(KERN_WARNING "%s(%d) encountered an "
+				       "unaligned exception which required\n"
+				       "kernel assistance, which degrades "
+				       "the performance of the application.\n"
+				       "Unaligned exception warnings have "
+				       "been disabled by the system "
+				       "administrator\n"
+				       "echo 0 > /proc/sys/kernel/ignore-"
+				       "unaligned-usertrap to re-enable\n",
+				       current->comm, current->pid);
+			}
 		}
 	} else {
 		if (within_logging_rate_limit())
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 0e92bf7ec28..bac61db2645 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -147,6 +147,7 @@ enum
 	KERN_SETUID_DUMPABLE=69, /* int: behaviour of dumps for setuid core */
 	KERN_SPIN_RETRY=70,	/* int: number of spinlock retries */
 	KERN_ACPI_VIDEO_FLAGS=71, /* int: flags for setting up video after ACPI sleep */
+	KERN_IA64_UNALIGNED=72, /* int: ia64 unaligned userland trap enable */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c05a2b7125e..acf6c1550f2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -124,6 +124,10 @@ extern int sysctl_hz_timer;
 extern int acct_parm[];
 #endif
 
+#ifdef CONFIG_IA64
+extern int no_unaligned_warning;
+#endif
+
 static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -665,6 +669,16 @@ static ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+#endif
+#ifdef CONFIG_IA64
+	{
+		.ctl_name	= KERN_IA64_UNALIGNED,
+		.procname	= "ignore-unaligned-usertrap",
+		.data		= &no_unaligned_warning,
+		.maxlen		= sizeof (int),
+	 	.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 	{ .ctl_name = 0 }
 };
-- 
cgit v1.2.3-70-g09d2


From 7f99f06f01aa9460b5a18f1b0e0900c90d0a84fc Mon Sep 17 00:00:00 2001
From: Stefan Seyfried <seife@suse.de>
Date: Thu, 2 Mar 2006 02:54:34 -0800
Subject: [PATCH] fix acpi_video_flags on x86-64

acpi_video_flags variable is unsigned long, so it should be set as such.
This actually matters on x86-64.

Signed-off-by: Stefan Seyfried <seife@suse.de>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: "Brown, Len" <len.brown@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index acf6c1550f2..de2d9109194 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -667,7 +667,7 @@ static ctl_table kern_table[] = {
 		.data		= &acpi_video_flags,
 		.maxlen		= sizeof (unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_doulongvec_minmax,
 	},
 #endif
 #ifdef CONFIG_IA64
-- 
cgit v1.2.3-70-g09d2


From 685db65e422bfa523b8a9dacb5a658b42b254f05 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Thu, 2 Mar 2006 02:54:35 -0800
Subject: [PATCH] time_interpolator: Use readq_relaxed() instead of readq().

On some platforms readq performs additional work to make sure I/O is done
in a coherent way.  This is not needed for time retrieval as done by the
time interpolator.  So we can use readq_relaxed instead which will improve
performance.

It affects sparc64 and ia64 only.  Apparently it makes a significant
difference on ia64.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: john stultz <johnstul@us.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index fe3a9a9f832..fc6646fd5aa 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1351,10 +1351,10 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
 			return x();
 
 		case TIME_SOURCE_MMIO64	:
-			return readq((void __iomem *) time_interpolator->addr);
+			return readq_relaxed((void __iomem *)time_interpolator->addr);
 
 		case TIME_SOURCE_MMIO32	:
-			return readl((void __iomem *) time_interpolator->addr);
+			return readl_relaxed((void __iomem *)time_interpolator->addr);
 
 		default: return get_cycles();
 	}
-- 
cgit v1.2.3-70-g09d2


From 8ba7b0a14b2ec19583bedbcdbea7f1c5008fc922 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 6 Mar 2006 17:38:49 -0800
Subject: Add early-boot-safety check to cond_resched()

Just to be safe, we should not trigger a conditional reschedule during
the early boot sequence.  We've historically done some questionable
early on, and the safety warnings in __might_sleep() are generally
turned off during that period, so there might be problems lurking.

This affects CONFIG_PREEMPT_VOLUNTARY, which takes over might_sleep() to
cause a voluntary conditional reschedule.

Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 12d291bf337..3454bb869fd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4028,6 +4028,8 @@ static inline void __cond_resched(void)
 	 */
 	if (unlikely(preempt_count()))
 		return;
+	if (unlikely(system_state != SYSTEM_RUNNING))
+		return;
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
 		schedule();
-- 
cgit v1.2.3-70-g09d2


From 69239749e1ac4f3496906aa4267cb9f61ce52c9c Mon Sep 17 00:00:00 2001
From: Tony Lindgren <tony@atomide.com>
Date: Mon, 6 Mar 2006 15:42:45 -0800
Subject: [PATCH] fix next_timer_interrupt() for hrtimer

Also from Thomas Gleixner <tglx@linutronix.de>

Function next_timer_interrupt() got broken with a recent patch
6ba1b91213e81aa92b5cf7539f7d2a94ff54947c as sys_nanosleep() was moved to
hrtimer.  This broke things as next_timer_interrupt() did not check hrtimer
tree for next event.

Function next_timer_interrupt() is needed with dyntick (CONFIG_NO_IDLE_HZ,
VST) implementations, as the system can be in idle when next hrtimer event
was supposed to happen.  At least ARM and S390 currently use
next_timer_interrupt().

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/time.c  | 10 ++++++----
 include/linux/hrtimer.h |  4 ++++
 kernel/hrtimer.c        | 35 +++++++++++++++++++++++++++++++++++
 kernel/timer.c          | 16 ++++++++++++++++
 4 files changed, 61 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index d7d932c0286..d6bd435a685 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -422,12 +422,14 @@ static int timer_dyn_tick_disable(void)
 void timer_dyn_reprogram(void)
 {
 	struct dyn_tick_timer *dyn_tick = system_timer->dyn_tick;
+	unsigned long next, seq;
 
-	if (dyn_tick) {
-		write_seqlock(&xtime_lock);
-		if (dyn_tick->state & DYN_TICK_ENABLED)
+	if (dyn_tick && (dyn_tick->state & DYN_TICK_ENABLED)) {
+		next = next_timer_interrupt();
+		do {
+			seq = read_seqbegin(&xtime_lock);
 			dyn_tick->reprogram(next_timer_interrupt() - jiffies);
-		write_sequnlock(&xtime_lock);
+		} while (read_seqretry(&xtime_lock, seq));
 	}
 }
 
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6361544bb6a..6401c31d6ad 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -116,6 +116,10 @@ extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
 
+#ifdef CONFIG_NO_IDLE_HZ
+extern ktime_t hrtimer_get_next_event(void);
+#endif
+
 static inline int hrtimer_active(const struct hrtimer *timer)
 {
 	return timer->state == HRTIMER_PENDING;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5ae51f1bc7c..14bc9cfa639 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -505,6 +505,41 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
 	return rem;
 }
 
+#ifdef CONFIG_NO_IDLE_HZ
+/**
+ * hrtimer_get_next_event - get the time until next expiry event
+ *
+ * Returns the delta to the next expiry event or KTIME_MAX if no timer
+ * is pending.
+ */
+ktime_t hrtimer_get_next_event(void)
+{
+	struct hrtimer_base *base = __get_cpu_var(hrtimer_bases);
+	ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
+	unsigned long flags;
+	int i;
+
+	for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
+		struct hrtimer *timer;
+
+		spin_lock_irqsave(&base->lock, flags);
+		if (!base->first) {
+			spin_unlock_irqrestore(&base->lock, flags);
+			continue;
+		}
+		timer = rb_entry(base->first, struct hrtimer, node);
+		delta.tv64 = timer->expires.tv64;
+		spin_unlock_irqrestore(&base->lock, flags);
+		delta = ktime_sub(delta, base->get_time());
+		if (delta.tv64 < mindelta.tv64)
+			mindelta.tv64 = delta.tv64;
+	}
+	if (mindelta.tv64 < 0)
+		mindelta.tv64 = 0;
+	return mindelta;
+}
+#endif
+
 /**
  * hrtimer_init - initialize a timer to the given clock
  *
diff --git a/kernel/timer.c b/kernel/timer.c
index fc6646fd5aa..8256f3f5ec0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -489,9 +489,21 @@ unsigned long next_timer_interrupt(void)
 	struct list_head *list;
 	struct timer_list *nte;
 	unsigned long expires;
+	unsigned long hr_expires = MAX_JIFFY_OFFSET;
+	ktime_t hr_delta;
 	tvec_t *varray[4];
 	int i, j;
 
+	hr_delta = hrtimer_get_next_event();
+	if (hr_delta.tv64 != KTIME_MAX) {
+		struct timespec tsdelta;
+		tsdelta = ktime_to_timespec(hr_delta);
+		hr_expires = timespec_to_jiffies(&tsdelta);
+		if (hr_expires < 3)
+			return hr_expires + jiffies;
+	}
+	hr_expires += jiffies;
+
 	base = &__get_cpu_var(tvec_bases);
 	spin_lock(&base->t_base.lock);
 	expires = base->timer_jiffies + (LONG_MAX >> 1);
@@ -542,6 +554,10 @@ found:
 		}
 	}
 	spin_unlock(&base->t_base.lock);
+
+	if (time_before(hr_expires, expires))
+		return hr_expires;
+
 	return expires;
 }
 #endif
-- 
cgit v1.2.3-70-g09d2


From 5aee405c662ca644980c184774277fc6d0769a84 Mon Sep 17 00:00:00 2001
From: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Date: Mon, 6 Mar 2006 15:42:51 -0800
Subject: [PATCH] time: add barrier after updating jiffies_64

Add a compiler barrier so that we don't read jiffies before updating
jiffies_64.

Signed-off-by: Atsushi Nemoto <anemo@mba.ocn.ne.jp>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 8256f3f5ec0..bf7c4193b93 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -941,6 +941,8 @@ static inline void update_times(void)
 void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
+	/* prevent loading jiffies before storing new jiffies_64 value. */
+	barrier();
 	update_times();
 	softlockup_tick(regs);
 }
-- 
cgit v1.2.3-70-g09d2


From 81c29a857d3c8d6ea9c4f20d196c36bf0a07c615 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Mar 2006 21:55:27 -0800
Subject: [PATCH] idle threads should have a sane ->timestamp value

Idle threads should have a sane ->timestamp value, to avoid init kernel
thread(s) from inheriting it and causing miscalculations in
try_to_wake_up().

Reported-by: Mike Galbraith <efault@gmx.de>.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3454bb869fd..e82c99f1db6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4335,6 +4335,7 @@ void __devinit init_idle(task_t *idle, int cpu)
 	runqueue_t *rq = cpu_rq(cpu);
 	unsigned long flags;
 
+	idle->timestamp = sched_clock();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
 	idle->prio = MAX_PRIO;
-- 
cgit v1.2.3-70-g09d2


From 21a1ea9eb40411d4ee29448c53b9e4c0654d6ceb Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Tue, 7 Mar 2006 21:55:33 -0800
Subject: [PATCH] rcu batch tuning

This patch adds new tunables for RCU queue and finished batches.  There are
two types of controls - number of completed RCU updates invoked in a batch
(blimit) and monitoring for high rate of incoming RCUs on a cpu (qhimark,
qlowmark).

By default, the per-cpu batch limit is set to a small value.  If the input
RCU rate exceeds the high watermark, we do two things - force quiescent
state on all cpus and set the batch limit of the CPU to INTMAX.  Setting
batch limit to INTMAX forces all finished RCUs to be processed in one shot.
 If we have more than INTMAX RCUs queued up, then we have bigger problems
anyway.  Once the incoming queued RCUs fall below the low watermark, the
batch limit is set to the default.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/kernel-parameters.txt | 13 +++++++
 include/linux/rcupdate.h            |  6 ++-
 kernel/rcupdate.c                   | 76 ++++++++++++++++++++++++++++---------
 3 files changed, 76 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 75205391b33..bad5987c472 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1284,6 +1284,19 @@ running once the system is up.
 			New name for the ramdisk parameter.
 			See Documentation/ramdisk.txt.
 
+	rcu.blimit=	[KNL,BOOT] Set maximum number of finished
+			RCU callbacks to process in one batch.
+
+	rcu.qhimark=	[KNL,BOOT] Set threshold of queued
+			RCU callbacks over which batch limiting is disabled.
+
+	rcu.qlowmark=	[KNL,BOOT] Set threshold of queued
+			RCU callbacks below which batch limiting is re-enabled.
+
+	rcu.rsinterval=	[KNL,BOOT,SMP] Set the number of additional
+			RCU callbacks to queued before forcing reschedule
+			on all cpus.
+
 	rdinit=		[KNL]
 			Format: <full_path>
 			Run specified binary instead of /init from the ramdisk,
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b87aefa082e..c2ec6c77874 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -98,13 +98,17 @@ struct rcu_data {
 	long  	       	batch;           /* Batch # for current RCU batch */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail;
-	long            count; /* # of queued items */
+	long            qlen; 	 	 /* # of queued callbacks */
 	struct rcu_head *curlist;
 	struct rcu_head **curtail;
 	struct rcu_head *donelist;
 	struct rcu_head **donetail;
+	long		blimit;		 /* Upper limit on a processed batch */
 	int cpu;
 	struct rcu_head barrier;
+#ifdef CONFIG_SMP
+	long		last_rs_qlen;	 /* qlen during the last resched */
+#endif
 };
 
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 0cf8146bd58..8cf15a569fc 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -67,7 +67,43 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10000;
+static int blimit = 10;
+static int qhimark = 10000;
+static int qlowmark = 100;
+#ifdef CONFIG_SMP
+static int rsinterval = 1000;
+#endif
+
+static atomic_t rcu_barrier_cpu_count;
+static struct semaphore rcu_barrier_sema;
+static struct completion rcu_barrier_completion;
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(rdp->qlen - rdp->last_rs_qlen > rsinterval)) {
+		rdp->last_rs_qlen = rdp->qlen;
+		/*
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();
+}
+#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -92,17 +128,13 @@ void fastcall call_rcu(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->count > 10000))
-		set_need_resched();
-
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
 	local_irq_restore(flags);
 }
 
-static atomic_t rcu_barrier_cpu_count;
-static struct semaphore rcu_barrier_sema;
-static struct completion rcu_barrier_completion;
-
 /**
  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -131,12 +163,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
-	rdp->count++;
-/*
- *  Should we directly call rcu_do_batch() here ?
- *  if (unlikely(rdp->count > 10000))
- *      rcu_do_batch(rdp);
- */
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
 	local_irq_restore(flags);
 }
 
@@ -199,10 +231,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
-		rdp->count--;
-		if (++count >= maxbatch)
+		rdp->qlen--;
+		if (++count >= rdp->blimit)
 			break;
 	}
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 	if (!rdp->donelist)
 		rdp->donetail = &rdp->donelist;
 	else
@@ -473,6 +507,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
+	rdp->blimit = blimit;
 }
 
 static void __devinit rcu_online_cpu(int cpu)
@@ -567,7 +602,12 @@ void synchronize_kernel(void)
 	synchronize_rcu();
 }
 
-module_param(maxbatch, int, 0);
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+#ifdef CONFIG_SMP
+module_param(rsinterval, int, 0);
+#endif
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
-- 
cgit v1.2.3-70-g09d2


From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001
From: Dipankar Sarma <dipankar@in.ibm.com>
Date: Tue, 7 Mar 2006 21:55:35 -0800
Subject: [PATCH] fix file counting

I have benchmarked this on an x86_64 NUMA system and see no significant
performance difference on kernbench.  Tested on both x86_64 and powerpc.

The way we do file struct accounting is not very suitable for batched
freeing.  For scalability reasons, file accounting was
constructor/destructor based.  This meant that nr_files was decremented
only when the object was removed from the slab cache.  This is susceptible
to slab fragmentation.  With RCU based file structure, consequent batched
freeing and a test program like Serge's, we just speed this up and end up
with a very fragmented slab -

llm22:~ # cat /proc/sys/fs/file-nr
587730  0       758844

At the same time, I see only a 2000+ objects in filp cache.  The following
patch I fixes this problem.

This patch changes the file counting by removing the filp_count_lock.
Instead we use a separate percpu counter, nr_files, for now and all
accesses to it are through get_nr_files() api.  In the sysctl handler for
nr_files, we populate files_stat.nr_files before returning to user.

Counting files as an when they are created and destroyed (as opposed to
inside slab) allows us to correctly count open files with RCU.

Signed-off-by: Dipankar Sarma <dipankar@in.ibm.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dcache.c          |  2 +-
 fs/file_table.c      | 87 +++++++++++++++++++++++++++++++++-------------------
 include/linux/file.h |  2 --
 include/linux/fs.h   |  1 +
 kernel/sysctl.c      |  5 ++-
 net/unix/af_unix.c   |  2 +-
 6 files changed, 62 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/fs/dcache.c b/fs/dcache.c
index a173bba3266..11dc83092d4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages)
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
 	dcache_init(mempages);
 	inode_init(mempages);
diff --git a/fs/file_table.c b/fs/file_table.c
index 768b5816754..44fabeaa941 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -5,6 +5,7 @@
  *  Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
  */
 
+#include <linux/config.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/file.h>
@@ -19,52 +20,67 @@
 #include <linux/capability.h>
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
+#include <linux/sysctl.h>
+#include <linux/percpu_counter.h>
+
+#include <asm/atomic.h>
 
 /* sysctl tunables... */
 struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-EXPORT_SYMBOL(files_stat); /* Needed by unix.o */
-
 /* public. Not pretty! */
- __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
-static DEFINE_SPINLOCK(filp_count_lock);
+static struct percpu_counter nr_files __cacheline_aligned_in_smp;
 
-/* slab constructors and destructors are called from arbitrary
- * context and must be fully threaded - use a local spinlock
- * to protect files_stat.nr_files
- */
-void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags)
+static inline void file_free_rcu(struct rcu_head *head)
 {
-	if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-	    SLAB_CTOR_CONSTRUCTOR) {
-		unsigned long flags;
-		spin_lock_irqsave(&filp_count_lock, flags);
-		files_stat.nr_files++;
-		spin_unlock_irqrestore(&filp_count_lock, flags);
-	}
+	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
+	kmem_cache_free(filp_cachep, f);
 }
 
-void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags)
+static inline void file_free(struct file *f)
 {
-	unsigned long flags;
-	spin_lock_irqsave(&filp_count_lock, flags);
-	files_stat.nr_files--;
-	spin_unlock_irqrestore(&filp_count_lock, flags);
+	percpu_counter_dec(&nr_files);
+	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
-static inline void file_free_rcu(struct rcu_head *head)
+/*
+ * Return the total number of open files in the system
+ */
+static int get_nr_files(void)
 {
-	struct file *f =  container_of(head, struct file, f_u.fu_rcuhead);
-	kmem_cache_free(filp_cachep, f);
+	return percpu_counter_read_positive(&nr_files);
 }
 
-static inline void file_free(struct file *f)
+/*
+ * Return the maximum number of open files in the system
+ */
+int get_max_files(void)
 {
-	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
+	return files_stat.max_files;
 }
+EXPORT_SYMBOL_GPL(get_max_files);
+
+/*
+ * Handle nr_files sysctl
+ */
+#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	files_stat.nr_files = get_nr_files();
+	return proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+#else
+int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+#endif
 
 /* Find an unused file structure and return a pointer to it.
  * Returns NULL, if there are no more free file structures or
@@ -78,14 +94,20 @@ struct file *get_empty_filp(void)
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (files_stat.nr_files >= files_stat.max_files &&
-				!capable(CAP_SYS_ADMIN))
-		goto over;
+	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+		/*
+		 * percpu_counters are inaccurate.  Do an expensive check before
+		 * we go and fail.
+		 */
+		if (percpu_counter_sum(&nr_files) >= files_stat.max_files)
+			goto over;
+	}
 
 	f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
 	if (f == NULL)
 		goto fail;
 
+	percpu_counter_inc(&nr_files);
 	memset(f, 0, sizeof(*f));
 	if (security_file_alloc(f))
 		goto fail_sec;
@@ -101,10 +123,10 @@ struct file *get_empty_filp(void)
 
 over:
 	/* Ran out of filps - report that */
-	if (files_stat.nr_files > old_max) {
+	if (get_nr_files() > old_max) {
 		printk(KERN_INFO "VFS: file-max limit %d reached\n",
-					files_stat.max_files);
-		old_max = files_stat.nr_files;
+					get_max_files());
+		old_max = get_nr_files();
 	}
 	goto fail;
 
@@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
+	percpu_counter_init(&nr_files);
 } 
diff --git a/include/linux/file.h b/include/linux/file.h
index 418b6101b59..9901b850f2e 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -60,8 +60,6 @@ extern void put_filp(struct file *);
 extern int get_unused_fd(void);
 extern void FASTCALL(put_unused_fd(unsigned int fd));
 struct kmem_cache;
-extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags);
-extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags);
 
 extern struct file ** alloc_fd_array(int);
 extern void free_fd_array(struct file **, int);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0cc34b1c42c..51c0c93bdf9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -35,6 +35,7 @@ struct files_stat_struct {
 	int max_files;		/* tunable */
 };
 extern struct files_stat_struct files_stat;
+extern int get_max_files(void);
 
 struct inodes_stat_t {
 	int nr_inodes;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index de2d9109194..32b48e8ee36 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -50,6 +50,9 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 
+extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
+                     void __user *buffer, size_t *lenp, loff_t *ppos);
+
 #if defined(CONFIG_SYSCTL)
 
 /* External variables not in a header file. */
@@ -943,7 +946,7 @@ static ctl_table fs_table[] = {
 		.data		= &files_stat,
 		.maxlen		= 3*sizeof(int),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_nr_files,
 	},
 	{
 		.ctl_name	= FS_MAXFILE,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 1b5989b1b67..c323cc6a28b 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -547,7 +547,7 @@ static struct sock * unix_create1(struct socket *sock)
 	struct sock *sk = NULL;
 	struct unix_sock *u;
 
-	if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
+	if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
 		goto out;
 
 	sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
-- 
cgit v1.2.3-70-g09d2


From 7cd9013be6c22f3ff6f777354f766c8c0b955e17 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sat, 11 Mar 2006 03:27:18 -0800
Subject: [PATCH] remove __put_task_struct_cb export again

The patch '[PATCH] RCU signal handling' [1] added an export for
__put_task_struct_cb, a put_task_struct helper newly introduced in that
patch.  But the put_task_struct couldn't be used modular previously as
__put_task_struct wasn't exported.  There are not callers of it in modular
code, and it shouldn't be exported because we don't want drivers to hold
references to task_structs.

This patch removes the export and folds __put_task_struct into
__put_task_struct_cb as there's no other caller.

[1] http://www2.kernel.org/git/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e56d090310d7625ecb43a1eeebd479f04affb48b

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Paul E. McKenney <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 1 -
 kernel/fork.c         | 4 +++-
 kernel/sched.c        | 7 -------
 3 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff2e09c953b..62e6314382f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -892,7 +892,6 @@ static inline int pid_alive(struct task_struct *p)
 }
 
 extern void free_task(struct task_struct *tsk);
-extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
 
 extern void __put_task_struct_cb(struct rcu_head *rhp);
diff --git a/kernel/fork.c b/kernel/fork.c
index fbea12d7a94..a8eab86de7f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -108,8 +108,10 @@ void free_task(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(free_task);
 
-void __put_task_struct(struct task_struct *tsk)
+void __put_task_struct_cb(struct rcu_head *rhp)
 {
+	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
+
 	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
diff --git a/kernel/sched.c b/kernel/sched.c
index e82c99f1db6..4d46e90f59c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -178,13 +178,6 @@ static unsigned int task_timeslice(task_t *p)
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
-void __put_task_struct_cb(struct rcu_head *rhp)
-{
-	__put_task_struct(container_of(rhp, struct task_struct, rcu));
-}
-
-EXPORT_SYMBOL_GPL(__put_task_struct_cb);
-
 /*
  * These are the runqueue data structures:
  */
-- 
cgit v1.2.3-70-g09d2


From f9a3879abf2f1a27c39915e6074b8ff15a24cb55 Mon Sep 17 00:00:00 2001
From: GOTO Masanori <gotom@sanori.org>
Date: Mon, 13 Mar 2006 21:20:44 -0800
Subject: [PATCH] Fix sigaltstack corruption among cloned threads

This patch fixes alternate signal stack corruption among cloned threads
with CLONE_SIGHAND (and CLONE_VM) for linux-2.6.16-rc6.

The value of alternate signal stack is currently inherited after a call of
clone(...  CLONE_SIGHAND | CLONE_VM).  But if sigaltstack is set by a
parent thread, and then if multiple cloned child threads (+ parent threads)
call signal handler at the same time, some threads may be conflicted -
because they share to use the same alternative signal stack region.
Finally they get sigsegv.  It's an undesirable race condition.  Note that
child threads created from NPTL pthread_create() also hit this conflict
when the parent thread uses sigaltstack, without my patch.

To fix this problem, this patch clears the child threads' sigaltstack
information like exec().  This behavior follows the SUSv3 specification.
In SUSv3, pthread_create() says "The alternate stack shall not be inherited
(when new threads are initialized)".  It means that sigaltstack should be
cleared when sigaltstack memory space is shared by cloned threads with
CLONE_SIGHAND.

Note that I chose "if (clone_flags & CLONE_SIGHAND)" line because:
  - If clone_flags line is not existed, fork() does not inherit sigaltstack.
  - CLONE_VM is another choice, but vfork() does not inherit sigaltstack.
  - CLONE_SIGHAND implies CLONE_VM, and it looks suitable.
  - CLONE_THREAD is another candidate, and includes CLONE_SIGHAND + CLONE_VM,
    but this flag has a bit different semantics.
I decided to use CLONE_SIGHAND.

[ Changed to test for CLONE_VM && !CLONE_VFORK after discussion --Linus ]

Signed-off-by: GOTO Masanori <gotom@sanori.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Linus Torvalds <torvalds@osdl.org>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a8eab86de7f..ccdfbb16c86 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1061,6 +1061,12 @@ static task_t *copy_process(unsigned long clone_flags,
 	 */
 	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
 
+	/*
+	 * sigaltstack should be cleared when sharing the same VM
+	 */
+	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
+		p->sas_ss_sp = p->sas_ss_size = 0;
+
 	/*
 	 * Syscall tracing should be turned off in the child regardless
 	 * of CLONE_PTRACE.
-- 
cgit v1.2.3-70-g09d2