From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 20 May 2008 23:00:01 +0200
Subject: Introduce new top level suspend and hibernation callbacks

Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning
'extended') representing suspend and hibernation operations for bus
types, device classes, device types and device drivers.

Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops'
objects, if defined, instead of the ->suspend(), ->resume(),
->suspend_late(), and ->resume_early() callbacks (the old callbacks
will be considered as legacy and gradually phased out).

The main purpose of doing this is to separate suspend (aka S2RAM and
standby) callbacks from hibernation callbacks in such a way that the
new callbacks won't take arguments and the semantics of each of them
will be clearly specified.  This has been requested for multiple
times by many people, including Linus himself, and the reason is that
within the current scheme if ->resume() is called, for example, it's
difficult to say why it's been called (ie. is it a resume from RAM or
from hibernation or a suspend/hibernation failure etc.?).

The second purpose is to make the suspend/hibernation callbacks more
flexible so that device drivers can handle more than they can within
the current scheme.  For example, some drivers may need to prevent
new children of the device from being registered before their
->suspend() callbacks are executed or they may want to carry out some
operations requiring the availability of some other devices, not
directly bound via the parent-child relationship, in order to prepare
for the execution of ->suspend(), etc.

Ultimately, we'd like to stop using the freezing of tasks for suspend
and therefore the drivers' suspend/hibernation code will have to take
care of the handling of the user space during suspend/hibernation.
That, in turn, would be difficult within the current scheme, without
the new ->prepare() and ->complete() callbacks.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 kernel/power/disk.c | 22 +++++++++++++++-------
 kernel/power/main.c |  6 ++++--
 2 files changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 14a656cdc65..d416be0efa8 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -193,6 +193,7 @@ static int create_image(int platform_mode)
 	if (error)
 		return error;
 
+	device_pm_lock();
 	local_irq_disable();
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* call device_power_down() now.
@@ -224,9 +225,11 @@ static int create_image(int platform_mode)
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
-	device_power_up();
+	device_power_up(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Enable_irqs:
 	local_irq_enable();
+	device_pm_unlock();
 	return error;
 }
 
@@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode)
  Finish:
 	platform_finish(platform_mode);
  Resume_devices:
-	device_resume();
+	device_resume(in_suspend ?
+		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Resume_console:
 	resume_console();
  Close:
@@ -300,8 +304,9 @@ static int resume_target_kernel(void)
 {
 	int error;
 
+	device_pm_lock();
 	local_irq_disable();
-	error = device_power_down(PMSG_PRETHAW);
+	error = device_power_down(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
@@ -329,9 +334,10 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
-	device_power_up();
+	device_power_up(PMSG_RECOVER);
  Enable_irqs:
 	local_irq_enable();
+	device_pm_unlock();
 	return error;
 }
 
@@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode)
 
 	pm_prepare_console();
 	suspend_console();
-	error = device_suspend(PMSG_PRETHAW);
+	error = device_suspend(PMSG_QUIESCE);
 	if (error)
 		goto Finish;
 
@@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode)
 		enable_nonboot_cpus();
 	}
 	platform_restore_cleanup(platform_mode);
-	device_resume();
+	device_resume(PMSG_RECOVER);
  Finish:
 	resume_console();
 	pm_restore_console();
@@ -403,6 +409,7 @@ int hibernation_platform_enter(void)
 	if (error)
 		goto Finish;
 
+	device_pm_lock();
 	local_irq_disable();
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
@@ -411,6 +418,7 @@ int hibernation_platform_enter(void)
 		while (1);
 	}
 	local_irq_enable();
+	device_pm_unlock();
 
 	/*
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
@@ -419,7 +427,7 @@ int hibernation_platform_enter(void)
  Finish:
 	hibernation_ops->finish();
  Resume_devices:
-	device_resume();
+	device_resume(PMSG_RESTORE);
  Resume_console:
 	resume_console();
  Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6a6d5eb3524..d023b6b584e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state)
 {
 	int error = 0;
 
+	device_pm_lock();
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
 
@@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state)
 	if (!suspend_test(TEST_CORE))
 		error = suspend_ops->enter(state);
 
-	device_power_up();
+	device_power_up(PMSG_RESUME);
  Done:
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
+	device_pm_unlock();
 	return error;
 }
 
@@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_ops->finish)
 		suspend_ops->finish();
  Resume_devices:
-	device_resume();
+	device_resume(PMSG_RESUME);
  Resume_console:
 	resume_console();
  Close:
-- 
cgit v1.2.3-70-g09d2


From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 12 Jun 2008 23:24:06 +0200
Subject: Suspend-related patches for 2.6.27

ACPI PM: Add possibility to change suspend sequence

There are some systems out there that don't work correctly with
our current suspend/hibernation code ordering.  Provide a workaround
for these systems allowing them to pass 'acpi_sleep=old_ordering' in
the kernel command line so that it will use the pre-ACPI 2.0 ("old")
suspend code ordering.

Unfortunately, this requires us to add a platform hook to the
resuming of devices for recovering the platform in case one of the
device drivers' .suspend() routines returns error code.  Namely,
ACPI 1.0 specifies that _PTS should be called before suspending
devices, but _WAK still should be called before resuming them in
order to undo the changes made by _PTS.  However, if there is an
error during suspending devices, they are automatically resumed
without returning control to the PM core, so the _WAK has to be
called from within device_resume() in that cases.

The patch also reorders and refactors the ACPI suspend/hibernation
code to avoid duplication as far as reasonably possible.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 Documentation/kernel-parameters.txt |   6 +-
 arch/x86/kernel/acpi/sleep.c        |   2 +
 drivers/acpi/sleep/main.c           | 276 +++++++++++++++++++++---------------
 drivers/base/power/main.c           |   2 -
 include/linux/acpi.h                |   3 +
 include/linux/suspend.h             |  14 +-
 kernel/power/disk.c                 |  28 +++-
 kernel/power/main.c                 |  10 +-
 8 files changed, 215 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9cf7b34f2db..18d793ea0dd 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -147,10 +147,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			default: 0
 
 	acpi_sleep=	[HW,ACPI] Sleep options
-			Format: { s3_bios, s3_mode, s3_beep }
+			Format: { s3_bios, s3_mode, s3_beep, old_ordering }
 			See Documentation/power/video.txt for s3_bios and s3_mode.
 			s3_beep is for debugging; it makes the PC's speaker beep
 			as soon as the kernel's real-mode entry point is called.
+			old_ordering causes the ACPI 1.0 ordering of the _PTS
+			control method, wrt putting devices into low power
+			states, to be enforced (the ACPI 2.0 ordering of _PTS is
+			used by default).
 
 	acpi_sci=	[HW,ACPI] ACPI System Control Interrupt trigger mode
 			Format: { level | edge | high | low }
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index afc25ee9964..882e970032d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -124,6 +124,8 @@ static int __init acpi_sleep_setup(char *str)
 			acpi_realmode_flags |= 2;
 		if (strncmp(str, "s3_beep", 7) == 0)
 			acpi_realmode_flags |= 4;
+		if (strncmp(str, "old_ordering", 12) == 0)
+			acpi_old_suspend_ordering();
 		str = strchr(str, ',');
 		if (str != NULL)
 			str += strspn(str, ", \t");
diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index 0f2caea2fc8..4addf8ad50a 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -24,10 +24,6 @@
 
 u8 sleep_states[ACPI_S_STATE_COUNT];
 
-#ifdef CONFIG_PM_SLEEP
-static u32 acpi_target_sleep_state = ACPI_STATE_S0;
-#endif
-
 static int acpi_sleep_prepare(u32 acpi_state)
 {
 #ifdef CONFIG_ACPI_SLEEP
@@ -50,9 +46,96 @@ static int acpi_sleep_prepare(u32 acpi_state)
 	return 0;
 }
 
-#ifdef CONFIG_SUSPEND
-static struct platform_suspend_ops acpi_suspend_ops;
+#ifdef CONFIG_PM_SLEEP
+static u32 acpi_target_sleep_state = ACPI_STATE_S0;
+
+/*
+ * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the
+ * user to request that behavior by using the 'acpi_old_suspend_ordering'
+ * kernel command line option that causes the following variable to be set.
+ */
+static bool old_suspend_ordering;
+
+void __init acpi_old_suspend_ordering(void)
+{
+	old_suspend_ordering = true;
+}
+
+/**
+ *	acpi_pm_disable_gpes - Disable the GPEs.
+ */
+static int acpi_pm_disable_gpes(void)
+{
+	acpi_hw_disable_all_gpes();
+	return 0;
+}
+
+/**
+ *	__acpi_pm_prepare - Prepare the platform to enter the target state.
+ *
+ *	If necessary, set the firmware waking vector and do arch-specific
+ *	nastiness to get the wakeup code to the waking vector.
+ */
+static int __acpi_pm_prepare(void)
+{
+	int error = acpi_sleep_prepare(acpi_target_sleep_state);
+
+	if (error)
+		acpi_target_sleep_state = ACPI_STATE_S0;
+	return error;
+}
+
+/**
+ *	acpi_pm_prepare - Prepare the platform to enter the target sleep
+ *		state and disable the GPEs.
+ */
+static int acpi_pm_prepare(void)
+{
+	int error = __acpi_pm_prepare();
+
+	if (!error)
+		acpi_hw_disable_all_gpes();
+	return error;
+}
+
+/**
+ *	acpi_pm_finish - Instruct the platform to leave a sleep state.
+ *
+ *	This is called after we wake back up (or if entering the sleep state
+ *	failed).
+ */
+static void acpi_pm_finish(void)
+{
+	u32 acpi_state = acpi_target_sleep_state;
+
+	if (acpi_state == ACPI_STATE_S0)
+		return;
 
+	printk(KERN_INFO PREFIX "Waking up from system sleep state S%d\n",
+		acpi_state);
+	acpi_disable_wakeup_device(acpi_state);
+	acpi_leave_sleep_state(acpi_state);
+
+	/* reset firmware waking vector */
+	acpi_set_firmware_waking_vector((acpi_physical_address) 0);
+
+	acpi_target_sleep_state = ACPI_STATE_S0;
+}
+
+/**
+ *	acpi_pm_end - Finish up suspend sequence.
+ */
+static void acpi_pm_end(void)
+{
+	/*
+	 * This is necessary in case acpi_pm_finish() is not called during a
+	 * failing transition to a sleep state.
+	 */
+	acpi_target_sleep_state = ACPI_STATE_S0;
+}
+#endif /* CONFIG_PM_SLEEP */
+
+#ifdef CONFIG_SUSPEND
 extern void do_suspend_lowlevel(void);
 
 static u32 acpi_suspend_states[] = {
@@ -66,7 +149,6 @@ static u32 acpi_suspend_states[] = {
  *	acpi_suspend_begin - Set the target system sleep state to the state
  *		associated with given @pm_state, if supported.
  */
-
 static int acpi_suspend_begin(suspend_state_t pm_state)
 {
 	u32 acpi_state = acpi_suspend_states[pm_state];
@@ -82,25 +164,6 @@ static int acpi_suspend_begin(suspend_state_t pm_state)
 	return error;
 }
 
-/**
- *	acpi_suspend_prepare - Do preliminary suspend work.
- *
- *	If necessary, set the firmware waking vector and do arch-specific
- *	nastiness to get the wakeup code to the waking vector.
- */
-
-static int acpi_suspend_prepare(void)
-{
-	int error = acpi_sleep_prepare(acpi_target_sleep_state);
-
-	if (error) {
-		acpi_target_sleep_state = ACPI_STATE_S0;
-		return error;
-	}
-
-	return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT;
-}
-
 /**
  *	acpi_suspend_enter - Actually enter a sleep state.
  *	@pm_state: ignored
@@ -109,7 +172,6 @@ static int acpi_suspend_prepare(void)
  *	assembly, which in turn call acpi_enter_sleep_state().
  *	It's unfortunate, but it works. Please fix if you're feeling frisky.
  */
-
 static int acpi_suspend_enter(suspend_state_t pm_state)
 {
 	acpi_status status = AE_OK;
@@ -166,39 +228,6 @@ static int acpi_suspend_enter(suspend_state_t pm_state)
 	return ACPI_SUCCESS(status) ? 0 : -EFAULT;
 }
 
-/**
- *	acpi_suspend_finish - Instruct the platform to leave a sleep state.
- *
- *	This is called after we wake back up (or if entering the sleep state
- *	failed). 
- */
-
-static void acpi_suspend_finish(void)
-{
-	u32 acpi_state = acpi_target_sleep_state;
-
-	acpi_disable_wakeup_device(acpi_state);
-	acpi_leave_sleep_state(acpi_state);
-
-	/* reset firmware waking vector */
-	acpi_set_firmware_waking_vector((acpi_physical_address) 0);
-
-	acpi_target_sleep_state = ACPI_STATE_S0;
-}
-
-/**
- *	acpi_suspend_end - Finish up suspend sequence.
- */
-
-static void acpi_suspend_end(void)
-{
-	/*
-	 * This is necessary in case acpi_suspend_finish() is not called during a
-	 * failing transition to a sleep state.
-	 */
-	acpi_target_sleep_state = ACPI_STATE_S0;
-}
-
 static int acpi_suspend_state_valid(suspend_state_t pm_state)
 {
 	u32 acpi_state;
@@ -218,10 +247,39 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state)
 static struct platform_suspend_ops acpi_suspend_ops = {
 	.valid = acpi_suspend_state_valid,
 	.begin = acpi_suspend_begin,
-	.prepare = acpi_suspend_prepare,
+	.prepare = acpi_pm_prepare,
+	.enter = acpi_suspend_enter,
+	.finish = acpi_pm_finish,
+	.end = acpi_pm_end,
+};
+
+/**
+ *	acpi_suspend_begin_old - Set the target system sleep state to the
+ *		state associated with given @pm_state, if supported, and
+ *		execute the _PTS control method.  This function is used if the
+ *		pre-ACPI 2.0 suspend ordering has been requested.
+ */
+static int acpi_suspend_begin_old(suspend_state_t pm_state)
+{
+	int error = acpi_suspend_begin(pm_state);
+
+	if (!error)
+		error = __acpi_pm_prepare();
+	return error;
+}
+
+/*
+ * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has
+ * been requested.
+ */
+static struct platform_suspend_ops acpi_suspend_ops_old = {
+	.valid = acpi_suspend_state_valid,
+	.begin = acpi_suspend_begin_old,
+	.prepare = acpi_pm_disable_gpes,
 	.enter = acpi_suspend_enter,
-	.finish = acpi_suspend_finish,
-	.end = acpi_suspend_end,
+	.finish = acpi_pm_finish,
+	.end = acpi_pm_end,
+	.recover = acpi_pm_finish,
 };
 #endif /* CONFIG_SUSPEND */
 
@@ -229,22 +287,9 @@ static struct platform_suspend_ops acpi_suspend_ops = {
 static int acpi_hibernation_begin(void)
 {
 	acpi_target_sleep_state = ACPI_STATE_S4;
-
 	return 0;
 }
 
-static int acpi_hibernation_prepare(void)
-{
-	int error = acpi_sleep_prepare(ACPI_STATE_S4);
-
-	if (error) {
-		acpi_target_sleep_state = ACPI_STATE_S0;
-		return error;
-	}
-
-	return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT;
-}
-
 static int acpi_hibernation_enter(void)
 {
 	acpi_status status = AE_OK;
@@ -274,52 +319,55 @@ static void acpi_hibernation_leave(void)
 	acpi_leave_sleep_state_prep(ACPI_STATE_S4);
 }
 
-static void acpi_hibernation_finish(void)
+static void acpi_pm_enable_gpes(void)
 {
-	acpi_disable_wakeup_device(ACPI_STATE_S4);
-	acpi_leave_sleep_state(ACPI_STATE_S4);
-
-	/* reset firmware waking vector */
-	acpi_set_firmware_waking_vector((acpi_physical_address) 0);
-
-	acpi_target_sleep_state = ACPI_STATE_S0;
+	acpi_hw_enable_all_runtime_gpes();
 }
 
-static void acpi_hibernation_end(void)
-{
-	/*
-	 * This is necessary in case acpi_hibernation_finish() is not called
-	 * during a failing transition to the sleep state.
-	 */
-	acpi_target_sleep_state = ACPI_STATE_S0;
-}
+static struct platform_hibernation_ops acpi_hibernation_ops = {
+	.begin = acpi_hibernation_begin,
+	.end = acpi_pm_end,
+	.pre_snapshot = acpi_pm_prepare,
+	.finish = acpi_pm_finish,
+	.prepare = acpi_pm_prepare,
+	.enter = acpi_hibernation_enter,
+	.leave = acpi_hibernation_leave,
+	.pre_restore = acpi_pm_disable_gpes,
+	.restore_cleanup = acpi_pm_enable_gpes,
+};
 
-static int acpi_hibernation_pre_restore(void)
+/**
+ *	acpi_hibernation_begin_old - Set the target system sleep state to
+ *		ACPI_STATE_S4 and execute the _PTS control method.  This
+ *		function is used if the pre-ACPI 2.0 suspend ordering has been
+ *		requested.
+ */
+static int acpi_hibernation_begin_old(void)
 {
-	acpi_status status;
-
-	status = acpi_hw_disable_all_gpes();
-
-	return ACPI_SUCCESS(status) ? 0 : -EFAULT;
-}
+	int error = acpi_sleep_prepare(ACPI_STATE_S4);
 
-static void acpi_hibernation_restore_cleanup(void)
-{
-	acpi_hw_enable_all_runtime_gpes();
+	if (!error)
+		acpi_target_sleep_state = ACPI_STATE_S4;
+	return error;
 }
 
-static struct platform_hibernation_ops acpi_hibernation_ops = {
-	.begin = acpi_hibernation_begin,
-	.end = acpi_hibernation_end,
-	.pre_snapshot = acpi_hibernation_prepare,
-	.finish = acpi_hibernation_finish,
-	.prepare = acpi_hibernation_prepare,
+/*
+ * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has
+ * been requested.
+ */
+static struct platform_hibernation_ops acpi_hibernation_ops_old = {
+	.begin = acpi_hibernation_begin_old,
+	.end = acpi_pm_end,
+	.pre_snapshot = acpi_pm_disable_gpes,
+	.finish = acpi_pm_finish,
+	.prepare = acpi_pm_disable_gpes,
 	.enter = acpi_hibernation_enter,
 	.leave = acpi_hibernation_leave,
-	.pre_restore = acpi_hibernation_pre_restore,
-	.restore_cleanup = acpi_hibernation_restore_cleanup,
+	.pre_restore = acpi_pm_disable_gpes,
+	.restore_cleanup = acpi_pm_enable_gpes,
+	.recover = acpi_pm_finish,
 };
-#endif				/* CONFIG_HIBERNATION */
+#endif /* CONFIG_HIBERNATION */
 
 int acpi_suspend(u32 acpi_state)
 {
@@ -461,13 +509,15 @@ int __init acpi_sleep_init(void)
 		}
 	}
 
-	suspend_set_ops(&acpi_suspend_ops);
+	suspend_set_ops(old_suspend_ordering ?
+		&acpi_suspend_ops_old : &acpi_suspend_ops);
 #endif
 
 #ifdef CONFIG_HIBERNATION
 	status = acpi_get_sleep_type_data(ACPI_STATE_S4, &type_a, &type_b);
 	if (ACPI_SUCCESS(status)) {
-		hibernation_set_ops(&acpi_hibernation_ops);
+		hibernation_set_ops(old_suspend_ordering ?
+			&acpi_hibernation_ops_old : &acpi_hibernation_ops);
 		sleep_states[ACPI_STATE_S4] = 1;
 		printk(" S4");
 	}
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index d571204aaff..3250c5257b7 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -781,8 +781,6 @@ int device_suspend(pm_message_t state)
 	error = dpm_prepare(state);
 	if (!error)
 		error = dpm_suspend(state);
-	if (error)
-		device_resume(resume_event(state));
 	return error;
 }
 EXPORT_SYMBOL_GPL(device_suspend);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 41f7ce7edd7..33adcf91ef4 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -234,6 +234,9 @@ int acpi_check_region(resource_size_t start, resource_size_t n,
 int acpi_check_mem_region(resource_size_t start, resource_size_t n,
 		      const char *name);
 
+#ifdef CONFIG_PM_SLEEP
+void __init acpi_old_suspend_ordering(void);
+#endif /* CONFIG_PM_SLEEP */
 #else	/* CONFIG_ACPI */
 
 static inline int early_acpi_boot_init(void)
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index a6977423baf..e8e69159af7 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -86,6 +86,11 @@ typedef int __bitwise suspend_state_t;
  *	that implement @begin(), but platforms implementing @begin() should
  *	also provide a @end() which cleans up transitions aborted before
  *	@enter().
+ *
+ * @recover: Recover the platform from a suspend failure.
+ *	Called by the PM core if the suspending of devices fails.
+ *	This callback is optional and should only be implemented by platforms
+ *	which require special recovery actions in that situation.
  */
 struct platform_suspend_ops {
 	int (*valid)(suspend_state_t state);
@@ -94,6 +99,7 @@ struct platform_suspend_ops {
 	int (*enter)(suspend_state_t state);
 	void (*finish)(void);
 	void (*end)(void);
+	void (*recover)(void);
 };
 
 #ifdef CONFIG_SUSPEND
@@ -149,7 +155,7 @@ extern void mark_free_pages(struct zone *zone);
  * The methods in this structure allow a platform to carry out special
  * operations required by it during a hibernation transition.
  *
- * All the methods below must be implemented.
+ * All the methods below, except for @recover(), must be implemented.
  *
  * @begin: Tell the platform driver that we're starting hibernation.
  *	Called right after shrinking memory and before freezing devices.
@@ -189,6 +195,11 @@ extern void mark_free_pages(struct zone *zone);
  * @restore_cleanup: Clean up after a failing image restoration.
  *	Called right after the nonboot CPUs have been enabled and before
  *	thawing devices (runs with IRQs on).
+ *
+ * @recover: Recover the platform from a failure to suspend devices.
+ *	Called by the PM core if the suspending of devices during hibernation
+ *	fails.  This callback is optional and should only be implemented by
+ *	platforms which require special recovery actions in that situation.
  */
 struct platform_hibernation_ops {
 	int (*begin)(void);
@@ -200,6 +211,7 @@ struct platform_hibernation_ops {
 	void (*leave)(void);
 	int (*pre_restore)(void);
 	void (*restore_cleanup)(void);
+	void (*recover)(void);
 };
 
 #ifdef CONFIG_HIBERNATION
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index d416be0efa8..f011e0870b5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode)
 		hibernation_ops->restore_cleanup();
 }
 
+/**
+ *	platform_recover - recover the platform from a failure to suspend
+ *	devices.
+ */
+
+static void platform_recover(int platform_mode)
+{
+	if (platform_mode && hibernation_ops && hibernation_ops->recover)
+		hibernation_ops->recover();
+}
+
 /**
  *	create_image - freeze devices that need to be frozen with interrupts
  *	off, create the hibernation image and thaw those devices.  Control
@@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode)
 	suspend_console();
 	error = device_suspend(PMSG_FREEZE);
 	if (error)
-		goto Resume_console;
+		goto Recover_platform;
 
 	if (hibernation_test(TEST_DEVICES))
-		goto Resume_devices;
+		goto Recover_platform;
 
 	error = platform_pre_snapshot(platform_mode);
 	if (error || hibernation_test(TEST_PLATFORM))
@@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode)
  Resume_devices:
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
- Resume_console:
 	resume_console();
  Close:
 	platform_end(platform_mode);
 	return error;
+
+ Recover_platform:
+	platform_recover(platform_mode);
+	goto Resume_devices;
 }
 
 /**
@@ -398,8 +412,11 @@ int hibernation_platform_enter(void)
 
 	suspend_console();
 	error = device_suspend(PMSG_HIBERNATE);
-	if (error)
-		goto Resume_console;
+	if (error) {
+		if (hibernation_ops->recover)
+			hibernation_ops->recover();
+		goto Resume_devices;
+	}
 
 	error = hibernation_ops->prepare();
 	if (error)
@@ -428,7 +445,6 @@ int hibernation_platform_enter(void)
 	hibernation_ops->finish();
  Resume_devices:
 	device_resume(PMSG_RESTORE);
- Resume_console:
 	resume_console();
  Close:
 	hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d023b6b584e..3398f4651aa 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state)
 	error = device_suspend(PMSG_SUSPEND);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to suspend\n");
-		goto Resume_console;
+		goto Recover_platform;
 	}
 
 	if (suspend_test(TEST_DEVICES))
-		goto Resume_devices;
+		goto Recover_platform;
 
 	if (suspend_ops->prepare) {
 		error = suspend_ops->prepare();
@@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state)
 		suspend_ops->finish();
  Resume_devices:
 	device_resume(PMSG_RESUME);
- Resume_console:
 	resume_console();
  Close:
 	if (suspend_ops->end)
 		suspend_ops->end();
 	return error;
+
+ Recover_platform:
+	if (suspend_ops->recover)
+		suspend_ops->recover();
+	goto Resume_devices;
 }
 
 /**
-- 
cgit v1.2.3-70-g09d2


From b62b8ef906cdf7115af579ce7378886ce3e0ce00 Mon Sep 17 00:00:00 2001
From: Zhang Rui <rui.zhang@intel.com>
Date: Tue, 29 Apr 2008 02:35:56 -0400
Subject: force offline the processor during hot-removal

The ACPI device node for the cpu has already been unregistered
when acpi_processor_handle_eject is called.
Thus we should offline the cpu and continue, rather than a failure here.
http://bugzilla.kernel.org/show_bug.cgi?id=9772

Signed-off-by: Zhang Rui <rui.zhang@intel.com>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 drivers/acpi/processor_core.c | 6 +++---
 kernel/cpu.c                  | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c
index 9dd0fa93b9e..1c0008edccd 100644
--- a/drivers/acpi/processor_core.c
+++ b/drivers/acpi/processor_core.c
@@ -1014,9 +1014,9 @@ static acpi_status acpi_processor_hotadd_init(acpi_handle handle, int *p_cpu)
 
 static int acpi_processor_handle_eject(struct acpi_processor *pr)
 {
-	if (cpu_online(pr->id)) {
-		return (-EINVAL);
-	}
+	if (cpu_online(pr->id))
+		cpu_down(pr->id);
+
 	arch_unregister_cpu(pr->id);
 	acpi_unmap_lsapic(pr->id);
 	return (0);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b11f06dc149..cfb1d43ab80 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -299,6 +299,7 @@ int __ref cpu_down(unsigned int cpu)
 	cpu_maps_update_done();
 	return err;
 }
+EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
-- 
cgit v1.2.3-70-g09d2


From ebb12db51f6c13b30752fcf506baad4c617b153c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 11 Jun 2008 22:04:29 +0200
Subject: Freezer: Introduce PF_FREEZER_NOSIG

The freezer currently attempts to distinguish kernel threads from
user space tasks by checking if their mm pointer is unset and it
does not send fake signals to kernel threads.  However, there are
kernel threads, mostly related to networking, that behave like
user space tasks and may want to be sent a fake signal to be frozen.

Introduce the new process flag PF_FREEZER_NOSIG that will be set
by default for all kernel threads and make the freezer only send
fake signals to the tasks having PF_FREEZER_NOSIG unset.  Provide
the set_freezable_with_signal() function to be called by the kernel
threads that want to be sent a fake signal for freezing.

This patch should not change the freezer's observable behavior.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 include/linux/freezer.h | 10 +++++
 include/linux/sched.h   |  1 +
 kernel/kthread.c        |  2 +-
 kernel/power/process.c  | 97 +++++++++++++++++++++----------------------------
 4 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 08934995c7a..deddeedf325 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -127,6 +127,15 @@ static inline void set_freezable(void)
 	current->flags &= ~PF_NOFREEZE;
 }
 
+/*
+ * Tell the freezer that the current task should be frozen by it and that it
+ * should send a fake signal to the task to freeze it.
+ */
+static inline void set_freezable_with_signal(void)
+{
+	current->flags &= ~(PF_NOFREEZE | PF_FREEZER_NOSIG);
+}
+
 /*
  * Freezer-friendly wrappers around wait_event_interruptible() and
  * wait_event_interruptible_timeout(), originally defined in <linux/wait.h>
@@ -174,6 +183,7 @@ static inline void freezer_do_not_count(void) {}
 static inline void freezer_count(void) {}
 static inline int freezer_should_skip(struct task_struct *p) { return 0; }
 static inline void set_freezable(void) {}
+static inline void set_freezable_with_signal(void) {}
 
 #define wait_event_freezable(wq, condition)				\
 		wait_event_interruptible(wq, condition)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 21349173d14..ba2f859c6e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1494,6 +1494,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
 #define PF_FREEZER_SKIP	0x40000000	/* Freezer should not count it as freezeable */
+#define PF_FREEZER_NOSIG 0x80000000	/* Freezer won't send signals to it */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 97747cdd37c..ac3fb732664 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -235,7 +235,7 @@ int kthreadd(void *unused)
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed(tsk, CPU_MASK_ALL);
 
-	current->flags |= PF_NOFREEZE;
+	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index f1d0b345c9b..5fb87652f21 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -19,9 +19,6 @@
  */
 #define TIMEOUT	(20 * HZ)
 
-#define FREEZER_KERNEL_THREADS 0
-#define FREEZER_USER_SPACE 1
-
 static inline int freezeable(struct task_struct * p)
 {
 	if ((p == current) ||
@@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p)
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
 }
 
-static int has_mm(struct task_struct *p)
+static inline bool should_send_signal(struct task_struct *p)
 {
-	return (p->mm && !(p->flags & PF_BORROWED_MM));
+	return !(p->flags & PF_FREEZER_NOSIG);
 }
 
 /**
  *	freeze_task - send a freeze request to given task
  *	@p: task to send the request to
- *	@with_mm_only: if set, the request will only be sent if the task has its
- *		own mm
- *	Return value: 0, if @with_mm_only is set and the task has no mm of its
- *		own or the task is frozen, 1, otherwise
+ *	@sig_only: if set, the request will only be sent if the task has the
+ *		PF_FREEZER_NOSIG flag unset
+ *	Return value: 'false', if @sig_only is set and the task has
+ *		PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
  *
- *	The freeze request is sent by seting the tasks's TIF_FREEZE flag and
+ *	The freeze request is sent by setting the tasks's TIF_FREEZE flag and
  *	either sending a fake signal to it or waking it up, depending on whether
- *	or not it has its own mm (ie. it is a user land task).  If @with_mm_only
- *	is set and the task has no mm of its own (ie. it is a kernel thread),
- *	its TIF_FREEZE flag should not be set.
- *
- *	The task_lock() is necessary to prevent races with exit_mm() or
- *	use_mm()/unuse_mm() from occuring.
+ *	or not it has PF_FREEZER_NOSIG set.  If @sig_only is set and the task
+ *	has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
+ *	TIF_FREEZE flag will not be set.
  */
-static int freeze_task(struct task_struct *p, int with_mm_only)
+static bool freeze_task(struct task_struct *p, bool sig_only)
 {
-	int ret = 1;
+	/*
+	 * We first check if the task is freezing and next if it has already
+	 * been frozen to avoid the race with frozen_process() which first marks
+	 * the task as frozen and next clears its TIF_FREEZE.
+	 */
+	if (!freezing(p)) {
+		rmb();
+		if (frozen(p))
+			return false;
 
-	task_lock(p);
-	if (freezing(p)) {
-		if (has_mm(p)) {
-			if (!signal_pending(p))
-				fake_signal_wake_up(p);
-		} else {
-			if (with_mm_only)
-				ret = 0;
-			else
-				wake_up_state(p, TASK_INTERRUPTIBLE);
-		}
+		if (!sig_only || should_send_signal(p))
+			set_freeze_flag(p);
+		else
+			return false;
+	}
+
+	if (should_send_signal(p)) {
+		if (!signal_pending(p))
+			fake_signal_wake_up(p);
+	} else if (sig_only) {
+		return false;
 	} else {
-		rmb();
-		if (frozen(p)) {
-			ret = 0;
-		} else {
-			if (has_mm(p)) {
-				set_freeze_flag(p);
-				fake_signal_wake_up(p);
-			} else {
-				if (with_mm_only) {
-					ret = 0;
-				} else {
-					set_freeze_flag(p);
-					wake_up_state(p, TASK_INTERRUPTIBLE);
-				}
-			}
-		}
+		wake_up_state(p, TASK_INTERRUPTIBLE);
 	}
-	task_unlock(p);
-	return ret;
+
+	return true;
 }
 
 static void cancel_freezing(struct task_struct *p)
@@ -156,7 +143,7 @@ static void cancel_freezing(struct task_struct *p)
 	}
 }
 
-static int try_to_freeze_tasks(int freeze_user_space)
+static int try_to_freeze_tasks(bool sig_only)
 {
 	struct task_struct *g, *p;
 	unsigned long end_time;
@@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
 			if (frozen(p) || !freezeable(p))
 				continue;
 
-			if (!freeze_task(p, freeze_user_space))
+			if (!freeze_task(p, sig_only))
 				continue;
 
 			/*
@@ -235,13 +222,13 @@ int freeze_processes(void)
 	int error;
 
 	printk("Freezing user space processes ... ");
-	error = try_to_freeze_tasks(FREEZER_USER_SPACE);
+	error = try_to_freeze_tasks(true);
 	if (error)
 		goto Exit;
 	printk("done.\n");
 
 	printk("Freezing remaining freezable tasks ... ");
-	error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS);
+	error = try_to_freeze_tasks(false);
 	if (error)
 		goto Exit;
 	printk("done.");
@@ -251,7 +238,7 @@ int freeze_processes(void)
 	return error;
 }
 
-static void thaw_tasks(int thaw_user_space)
+static void thaw_tasks(bool nosig_only)
 {
 	struct task_struct *g, *p;
 
@@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space)
 		if (!freezeable(p))
 			continue;
 
-		if (!p->mm == thaw_user_space)
+		if (nosig_only && should_send_signal(p))
 			continue;
 
 		thaw_process(p);
@@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space)
 void thaw_processes(void)
 {
 	printk("Restarting tasks ... ");
-	thaw_tasks(FREEZER_KERNEL_THREADS);
-	thaw_tasks(FREEZER_USER_SPACE);
+	thaw_tasks(true);
+	thaw_tasks(false);
 	schedule();
 	printk("done.\n");
 }
-- 
cgit v1.2.3-70-g09d2


From 52d11025dba32bed696eaee1822b26529e764770 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@redhat.com>
Date: Wed, 11 Jun 2008 22:07:52 +0200
Subject: snapshot: Push BKL down into ioctl handlers

Push BKL down into ioctl handlers - snapshot device.

Signed-off-by: Alan Cox <alan@redhat.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/power/user.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index f5512cb3aa8..658262b1599 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,6 +23,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <linux/smp_lock.h>
 
 #include <asm/uaccess.h>
 
@@ -164,8 +165,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	return res;
 }
 
-static int snapshot_ioctl(struct inode *inode, struct file *filp,
-                          unsigned int cmd, unsigned long arg)
+static long snapshot_ioctl(struct file *filp, unsigned int cmd,
+							unsigned long arg)
 {
 	int error = 0;
 	struct snapshot_data *data;
@@ -181,6 +182,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 
 	data = filp->private_data;
 
+	lock_kernel();
+
 	switch (cmd) {
 
 	case SNAPSHOT_FREEZE:
@@ -389,7 +392,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
 		error = -ENOTTY;
 
 	}
-
+	unlock_kernel();
 	return error;
 }
 
@@ -399,7 +402,7 @@ static const struct file_operations snapshot_fops = {
 	.read = snapshot_read,
 	.write = snapshot_write,
 	.llseek = no_llseek,
-	.ioctl = snapshot_ioctl,
+	.unlocked_ioctl = snapshot_ioctl,
 };
 
 static struct miscdevice snapshot_device = {
-- 
cgit v1.2.3-70-g09d2


From 25f2f3daadaf0768a61d02ee3ed3d9a21e9dc46c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 11 Jun 2008 22:09:45 +0200
Subject: snapshot: Use pm_mutex for mutual exclusion

We can avoid taking the BKL in snapshot_ioctl() if pm_mutex is used to prevent
the ioctls from being executed concurrently.

In addition, although it is only possible to open /dev/snapshot once, the task
which has done that may spawn a child that will inherit the open descriptor,
so in theory they can call snapshot_write(), snapshot_read() and
snapshot_release() concurrently.  pm_mutex can also be used for mutual
exclusion in such cases.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/user.c | 68 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 42 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 658262b1599..a6332a31326 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -70,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 	struct snapshot_data *data;
 	int error;
 
-	if (!atomic_add_unless(&snapshot_device_available, -1, 0))
-		return -EBUSY;
+	mutex_lock(&pm_mutex);
+
+	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
+		error = -EBUSY;
+		goto Unlock;
+	}
 
 	if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
 		atomic_inc(&snapshot_device_available);
-		return -ENOSYS;
+		error = -ENOSYS;
+		goto Unlock;
 	}
 	if(create_basic_memory_bitmaps()) {
 		atomic_inc(&snapshot_device_available);
-		return -ENOMEM;
+		error = -ENOMEM;
+		goto Unlock;
 	}
 	nonseekable_open(inode, filp);
 	data = &snapshot_state;
@@ -99,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		if (error)
 			pm_notifier_call_chain(PM_POST_HIBERNATION);
 	}
-	if (error) {
+	if (error)
 		atomic_inc(&snapshot_device_available);
-		return error;
-	}
 	data->frozen = 0;
 	data->ready = 0;
 	data->platform_support = 0;
 
-	return 0;
+ Unlock:
+	mutex_unlock(&pm_mutex);
+
+	return error;
 }
 
 static int snapshot_release(struct inode *inode, struct file *filp)
 {
 	struct snapshot_data *data;
 
+	mutex_lock(&pm_mutex);
+
 	swsusp_free();
 	free_basic_memory_bitmaps();
 	data = filp->private_data;
 	free_all_swap_pages(data->swap);
-	if (data->frozen) {
-		mutex_lock(&pm_mutex);
+	if (data->frozen)
 		thaw_processes();
-		mutex_unlock(&pm_mutex);
-	}
 	pm_notifier_call_chain(data->mode == O_WRONLY ?
 			PM_POST_HIBERNATION : PM_POST_RESTORE);
 	atomic_inc(&snapshot_device_available);
+
+	mutex_unlock(&pm_mutex);
+
 	return 0;
 }
 
@@ -135,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
 	struct snapshot_data *data;
 	ssize_t res;
 
+	mutex_lock(&pm_mutex);
+
 	data = filp->private_data;
-	if (!data->ready)
-		return -ENODATA;
+	if (!data->ready) {
+		res = -ENODATA;
+		goto Unlock;
+	}
 	res = snapshot_read_next(&data->handle, count);
 	if (res > 0) {
 		if (copy_to_user(buf, data_of(data->handle), res))
@@ -145,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
 		else
 			*offp = data->handle.offset;
 	}
+
+ Unlock:
+	mutex_unlock(&pm_mutex);
+
 	return res;
 }
 
@@ -154,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 	struct snapshot_data *data;
 	ssize_t res;
 
+	mutex_lock(&pm_mutex);
+
 	data = filp->private_data;
 	res = snapshot_write_next(&data->handle, count);
 	if (res > 0) {
@@ -162,6 +181,9 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
 		else
 			*offp = data->handle.offset;
 	}
+
+	mutex_unlock(&pm_mutex);
+
 	return res;
 }
 
@@ -180,16 +202,16 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	data = filp->private_data;
+	if (!mutex_trylock(&pm_mutex))
+		return -EBUSY;
 
-	lock_kernel();
+	data = filp->private_data;
 
 	switch (cmd) {
 
 	case SNAPSHOT_FREEZE:
 		if (data->frozen)
 			break;
-		mutex_lock(&pm_mutex);
 		printk("Syncing filesystems ... ");
 		sys_sync();
 		printk("done.\n");
@@ -197,7 +219,6 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		error = freeze_processes();
 		if (error)
 			thaw_processes();
-		mutex_unlock(&pm_mutex);
 		if (!error)
 			data->frozen = 1;
 		break;
@@ -205,9 +226,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 	case SNAPSHOT_UNFREEZE:
 		if (!data->frozen || data->ready)
 			break;
-		mutex_lock(&pm_mutex);
 		thaw_processes();
-		mutex_unlock(&pm_mutex);
 		data->frozen = 0;
 		break;
 
@@ -310,16 +329,11 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 			error = -EPERM;
 			break;
 		}
-		if (!mutex_trylock(&pm_mutex)) {
-			error = -EBUSY;
-			break;
-		}
 		/*
 		 * Tasks are frozen and the notifiers have been called with
 		 * PM_HIBERNATION_PREPARE
 		 */
 		error = suspend_devices_and_enter(PM_SUSPEND_MEM);
-		mutex_unlock(&pm_mutex);
 		break;
 
 	case SNAPSHOT_PLATFORM_SUPPORT:
@@ -392,7 +406,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
 		error = -ENOTTY;
 
 	}
-	unlock_kernel();
+
+	mutex_unlock(&pm_mutex);
+
 	return error;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 98abed02007b19bbfd68b6d06a5485afc3eeb01b Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 19 Mar 2008 19:24:59 -0700
Subject: do_wait reorganization

This breaks out the guts of do_wait into three subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread and ptrace_do_wait contain the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c | 215 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 135 insertions(+), 80 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ceb25878283..7453356a961 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1238,7 +1238,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
 			    struct siginfo __user *infop,
 			    int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1246,7 +1246,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 
-	if (unlikely(noreap)) {
+	if (!likely(options & WEXITED))
+		return 0;
+
+	if (unlikely(options & WNOWAIT)) {
 		uid_t uid = p->uid;
 		int exit_code = p->exit_code;
 		int why, status;
@@ -1397,13 +1400,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
  * released the lock and the system call should return.
  */
 static int wait_task_stopped(struct task_struct *p,
-			     int noreap, struct siginfo __user *infop,
+			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
 	int retval, exit_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
+	if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
@@ -1421,7 +1427,7 @@ static int wait_task_stopped(struct task_struct *p,
 	if (!exit_code)
 		goto unlock_sig;
 
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
 	uid = p->uid;
@@ -1442,7 +1448,7 @@ unlock_sig:
 	why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
-	if (unlikely(noreap))
+	if (unlikely(options & WNOWAIT))
 		return wait_noreap_copyout(p, pid, uid,
 					   why, exit_code,
 					   infop, ru);
@@ -1476,7 +1482,7 @@ unlock_sig:
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
 			       struct siginfo __user *infop,
 			       int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1484,6 +1490,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	pid_t pid;
 	uid_t uid;
 
+	if (!unlikely(options & WCONTINUED))
+		return 0;
+
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
 		return 0;
 
@@ -1493,7 +1502,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	spin_unlock_irq(&p->sighand->siglock);
 
@@ -1519,89 +1528,137 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	return retval;
 }
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+			      struct task_struct *p, int *notask_error,
+			      enum pid_type type, struct pid *pid, int options,
+			      struct siginfo __user *infop,
+			      int __user *stat_addr, struct rusage __user *ru)
+{
+	int ret = eligible_child(type, pid, options, p);
+	if (ret <= 0)
+		return ret;
+
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	/*
+	 * We don't reap group leaders with subthreads.
+	 */
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+		return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+	/*
+	 * It's stopped or running now, so it might
+	 * later continue, exit, or stop again.
+	 */
+	*notask_error = 0;
+
+	if (task_is_stopped_or_traced(p))
+		return wait_task_stopped(p, options, infop, stat_addr, ru);
+
+	return wait_task_continued(p, options, infop, stat_addr, ru);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * *@notask_error is 0 if there were any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *notask_error,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->children, sibling) {
+		int ret = wait_consider_task(tsk, p, notask_error,
+					     type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	/*
+	 * If we never saw an eligile child, check for children stolen by
+	 * ptrace.  We don't leave -ECHILD in *@notask_error if there are any,
+	 * because we will eventually be allowed to wait for them again.
+	 */
+	if (!*notask_error)
+		return 0;
+
+	list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+		int ret = eligible_child(type, pid, options, p);
+		if (unlikely(ret < 0))
+			return ret;
+		if (ret) {
+			*notask_error = 0;
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
 static long do_wait(enum pid_type type, struct pid *pid, int options,
 		    struct siginfo __user *infop, int __user *stat_addr,
 		    struct rusage __user *ru)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
-	int flag, retval;
+	int retval;
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
-	/* If there is nothing that can match our critier just get out */
+	/*
+	 * If there is nothing that can match our critiera just get out.
+	 * We will clear @retval to zero if we see any child that might later
+	 * match our criteria, even if we are not able to reap it yet.
+	 */
 	retval = -ECHILD;
 	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
 		goto end;
 
-	/*
-	 * We will set this flag if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
-	 */
-	flag = retval = 0;
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		struct task_struct *p;
-
-		list_for_each_entry(p, &tsk->children, sibling) {
-			int ret = eligible_child(type, pid, options, p);
-			if (!ret)
-				continue;
-
-			if (unlikely(ret < 0)) {
-				retval = ret;
-			} else if (task_is_stopped_or_traced(p)) {
-				/*
-				 * It's stopped now, so it might later
-				 * continue, exit, or stop again.
-				 */
-				flag = 1;
-				if (!(p->ptrace & PT_PTRACED) &&
-				    !(options & WUNTRACED))
-					continue;
-
-				retval = wait_task_stopped(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state == EXIT_ZOMBIE &&
-					!delay_group_leader(p)) {
-				/*
-				 * We don't reap group leaders with subthreads.
-				 */
-				if (!likely(options & WEXITED))
-					continue;
-				retval = wait_task_zombie(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state != EXIT_DEAD) {
-				/*
-				 * It's running now, so it might later
-				 * exit, stop, or stop and then continue.
-				 */
-				flag = 1;
-				if (!unlikely(options & WCONTINUED))
-					continue;
-				retval = wait_task_continued(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			}
-			if (retval != 0) /* tasklist_lock released */
-				goto end;
-		}
-		if (!flag) {
-			list_for_each_entry(p, &tsk->ptrace_children,
-								ptrace_list) {
-				flag = eligible_child(type, pid, options, p);
-				if (!flag)
-					continue;
-				if (likely(flag > 0))
-					break;
-				retval = flag;
-				goto end;
-			}
+		int tsk_result = do_wait_thread(tsk, &retval,
+						type, pid, options,
+						infop, stat_addr, ru);
+		if (!tsk_result)
+			tsk_result = ptrace_do_wait(tsk, &retval,
+						    type, pid, options,
+						    infop, stat_addr, ru);
+		if (tsk_result) {
+			/*
+			 * tasklist_lock is unlocked and we have a final result.
+			 */
+			retval = tsk_result;
+			goto end;
 		}
+
 		if (options & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
@@ -1609,16 +1666,14 @@ repeat:
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
-	if (flag) {
-		if (options & WNOHANG)
-			goto end;
+	if (!retval && !(options & WNOHANG)) {
 		retval = -ERESTARTSYS;
-		if (signal_pending(current))
-			goto end;
-		schedule();
-		goto repeat;
+		if (!signal_pending(current)) {
+			schedule();
+			goto repeat;
+		}
 	}
-	retval = -ECHILD;
+
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
-- 
cgit v1.2.3-70-g09d2


From f470021adb9190819c03d6d8c5c860a17480aa6d Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Mon, 24 Mar 2008 18:36:23 -0700
Subject: ptrace children revamp

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone.  Now ptrace, whether of one's own
children or another's via PTRACE_ATTACH, just uses the new ptraced
list instead.

There should be no user-visible difference that matters.  The only
change is the order in which do_wait() sees multiple stopped
children and stopped ptrace attachees.  Since wait_task_stopped()
was changed earlier so it no longer reorders the children list, we
already know this won't cause any new problems.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 include/linux/init_task.h |   4 +-
 include/linux/sched.h     |  26 +++---
 kernel/exit.c             | 226 ++++++++++++++++++++++++----------------------
 kernel/fork.c             |   6 +-
 kernel/ptrace.c           |  37 +++++---
 5 files changed, 160 insertions(+), 139 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9927a88674a..93c45acf249 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -140,8 +140,8 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
-	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
-	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
 	.real_parent	= &tsk,						\
 	.parent		= &tsk,						\
 	.children	= LIST_HEAD_INIT(tsk.children),			\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba2f859c6e4..1941d8b5cf1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1062,12 +1062,6 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
-	/*
-	 * ptrace_list/ptrace_children forms the list of my children
-	 * that were stolen by a ptracer.
-	 */
-	struct list_head ptrace_children;
-	struct list_head ptrace_list;
 
 	struct mm_struct *mm, *active_mm;
 
@@ -1089,18 +1083,25 @@ struct task_struct {
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
-	 * p->parent->pid)
+	 * p->real_parent->pid)
 	 */
-	struct task_struct *real_parent; /* real parent process (when being debugged) */
-	struct task_struct *parent;	/* parent process */
+	struct task_struct *real_parent; /* real parent process */
+	struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
 	/*
-	 * children/sibling forms the list of my children plus the
-	 * tasks I'm ptracing.
+	 * children/sibling forms the list of my natural children
 	 */
 	struct list_head children;	/* list of my children */
 	struct list_head sibling;	/* linkage in my parent's children list */
 	struct task_struct *group_leader;	/* threadgroup leader */
 
+	/*
+	 * ptraced is the list of tasks this task is using ptrace on.
+	 * This includes both natural children and PTRACE_ATTACH targets.
+	 * p->ptrace_entry is p's link on the p->parent->ptraced list.
+	 */
+	struct list_head ptraced;
+	struct list_head ptrace_entry;
+
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
 	struct list_head thread_group;
@@ -1876,9 +1877,6 @@ extern void wait_task_inactive(struct task_struct * p);
 #define wait_task_inactive(p)	do { } while (0)
 #endif
 
-#define remove_parent(p)	list_del_init(&(p)->sibling)
-#define add_parent(p)		list_add_tail(&(p)->sibling,&(p)->parent->children)
-
 #define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
 
 #define for_each_process(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index 7453356a961..1e909826a80 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -71,7 +71,7 @@ static void __unhash_process(struct task_struct *p)
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
-	remove_parent(p);
+	list_del_init(&p->sibling);
 }
 
 /*
@@ -152,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+	BUG_ON(!list_empty(&p->ptraced));
+	ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_entry));
+}
+
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
@@ -160,8 +172,7 @@ repeat:
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -315,9 +326,8 @@ static void reparent_to_kthreadd(void)
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	remove_parent(current);
 	current->real_parent = current->parent = kthreadd_task;
-	add_parent(current);
+	list_move_tail(&current->sibling, &current->real_parent->children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
@@ -692,37 +702,71 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach all tasks we were using ptrace on.
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+	struct task_struct *p, *n;
 
-	/* Move the child from its dying parent to the new one.  */
-	if (unlikely(traced)) {
-		/* Preserve ptrace links if someone else is tracing this child.  */
-		list_del_init(&p->ptrace_list);
-		if (ptrace_reparented(p))
-			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
-	} else {
-		/* If this child is being traced, then we're the one tracing it
-		 * anyway, so let go of it.
+	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
+		__ptrace_unlink(p);
+
+		if (p->exit_state != EXIT_ZOMBIE)
+			continue;
+
+		/*
+		 * If it's a zombie, our attachedness prevented normal
+		 * parent notification or self-reaping.  Do notification
+		 * now if it would have happened earlier.  If it should
+		 * reap itself, add it to the @dead list.  We can't call
+		 * release_task() here because we already hold tasklist_lock.
+		 *
+		 * If it's our own child, there is no notification to do.
 		 */
-		p->ptrace = 0;
-		remove_parent(p);
-		p->parent = p->real_parent;
-		add_parent(p);
+		if (!task_detached(p) && thread_group_empty(p)) {
+			if (!same_thread_group(p->real_parent, parent))
+				do_notify_parent(p, p->exit_signal);
+		}
 
-		if (task_is_traced(p)) {
+		if (task_detached(p)) {
 			/*
-			 * If it was at a trace stop, turn it into
-			 * a normal stop since it's no longer being
-			 * traced.
+			 * Mark it as in the process of being reaped.
 			 */
-			ptrace_untrace(p);
+			p->exit_state = EXIT_DEAD;
+			list_add(&p->ptrace_entry, dead);
 		}
 	}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+			       struct list_head *dead)
+{
+	struct task_struct *p, *n;
+
+	BUG_ON(!list_empty(&parent->ptraced));
+
+	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here.  */
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
@@ -737,7 +781,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!traced && p->exit_state == EXIT_ZOMBIE &&
+	if (!ptrace_reparented(p) &&
+	    p->exit_state == EXIT_ZOMBIE &&
 	    !task_detached(p) && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
@@ -754,12 +799,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper = father;
-	struct list_head ptrace_dead;
-
-	INIT_LIST_HEAD(&ptrace_dead);
+	LIST_HEAD(ptrace_dead);
 
 	write_lock_irq(&tasklist_lock);
 
+	/*
+	 * First clean up ptrace if we were using it.
+	 */
+	ptrace_exit(father, &ptrace_dead);
+
 	do {
 		reaper = next_thread(reaper);
 		if (reaper == father) {
@@ -768,58 +816,19 @@ static void forget_original_parent(struct task_struct *father)
 		}
 	} while (reaper->flags & PF_EXITING);
 
-	/*
-	 * There are only two places where our children can be:
-	 *
-	 * - in our child list
-	 * - in our ptraced child list
-	 *
-	 * Search them and reparent children.
-	 */
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
-		int ptrace;
-
-		ptrace = p->ptrace;
-
-		/* if father isn't the real parent, then ptrace must be enabled */
-		BUG_ON(father != p->real_parent && !ptrace);
-
-		if (father == p->real_parent) {
-			/* reparent with a reaper, real father it's us */
-			p->real_parent = reaper;
-			reparent_thread(p, father, 0);
-		} else {
-			/* reparent ptraced task to its real parent */
-			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
-			    thread_group_empty(p))
-				do_notify_parent(p, p->exit_signal);
-		}
-
-		/*
-		 * if the ptraced child is a detached zombie we must collect
-		 * it before we exit, or it will remain zombie forever since
-		 * we prevented it from self-reap itself while it was being
-		 * traced by us, to be able to see it in wait4.
-		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
-			list_add(&p->ptrace_list, &ptrace_dead);
-	}
-
-	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
 		p->real_parent = reaper;
-		reparent_thread(p, father, 1);
+		if (p->parent == father) {
+			BUG_ON(p->ptrace);
+			p->parent = p->real_parent;
+		}
+		reparent_thread(p, father);
 	}
 
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
-	BUG_ON(!list_empty(&father->ptrace_children));
-
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
-		list_del_init(&p->ptrace_list);
-		release_task(p);
-	}
 
+	ptrace_exit_finish(father, &ptrace_dead);
 }
 
 /*
@@ -1180,13 +1189,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 			return 0;
 	}
 
-	/*
-	 * Do not consider detached threads that are
-	 * not ptraced:
-	 */
-	if (task_detached(p) && !p->ptrace)
-		return 0;
-
 	/* Wait for all children (clone and not) if __WALL is set;
 	 * otherwise, wait for clone children *only* if __WCLONE is
 	 * set; otherwise, wait for non-clone children *only*.  (Note:
@@ -1399,7 +1401,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
  * the lock and this task is uninteresting.  If we return nonzero, we have
  * released the lock and the system call should return.
  */
-static int wait_task_stopped(struct task_struct *p,
+static int wait_task_stopped(int ptrace, struct task_struct *p,
 			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1407,7 +1409,7 @@ static int wait_task_stopped(struct task_struct *p,
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
-	if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+	if (!(options & WUNTRACED))
 		return 0;
 
 	exit_code = 0;
@@ -1416,7 +1418,7 @@ static int wait_task_stopped(struct task_struct *p,
 	if (unlikely(!task_is_stopped_or_traced(p)))
 		goto unlock_sig;
 
-	if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
+	if (!ptrace && p->signal->group_stop_count > 0)
 		/*
 		 * A group stop is in progress and this is the group leader.
 		 * We won't report until all threads have stopped.
@@ -1445,7 +1447,7 @@ unlock_sig:
 	 */
 	get_task_struct(p);
 	pid = task_pid_vnr(p);
-	why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
 	if (unlikely(options & WNOWAIT))
@@ -1536,7 +1538,7 @@ static int wait_task_continued(struct task_struct *p, int options,
  * Returns zero if the search for a child should continue;
  * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
  */
-static int wait_consider_task(struct task_struct *parent,
+static int wait_consider_task(struct task_struct *parent, int ptrace,
 			      struct task_struct *p, int *notask_error,
 			      enum pid_type type, struct pid *pid, int options,
 			      struct siginfo __user *infop,
@@ -1546,6 +1548,15 @@ static int wait_consider_task(struct task_struct *parent,
 	if (ret <= 0)
 		return ret;
 
+	if (likely(!ptrace) && unlikely(p->ptrace)) {
+		/*
+		 * This child is hidden by ptrace.
+		 * We aren't allowed to see it now, but eventually we will.
+		 */
+		*notask_error = 0;
+		return 0;
+	}
+
 	if (p->exit_state == EXIT_DEAD)
 		return 0;
 
@@ -1562,7 +1573,8 @@ static int wait_consider_task(struct task_struct *parent,
 	*notask_error = 0;
 
 	if (task_is_stopped_or_traced(p))
-		return wait_task_stopped(p, options, infop, stat_addr, ru);
+		return wait_task_stopped(ptrace, p, options,
+					 infop, stat_addr, ru);
 
 	return wait_task_continued(p, options, infop, stat_addr, ru);
 }
@@ -1583,11 +1595,16 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 	struct task_struct *p;
 
 	list_for_each_entry(p, &tsk->children, sibling) {
-		int ret = wait_consider_task(tsk, p, notask_error,
-					     type, pid, options,
-					     infop, stat_addr, ru);
-		if (ret)
-			return ret;
+		/*
+		 * Do not consider detached threads.
+		 */
+		if (!task_detached(p)) {
+			int ret = wait_consider_task(tsk, 0, p, notask_error,
+						     type, pid, options,
+						     infop, stat_addr, ru);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
@@ -1601,21 +1618,16 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
 	struct task_struct *p;
 
 	/*
-	 * If we never saw an eligile child, check for children stolen by
-	 * ptrace.  We don't leave -ECHILD in *@notask_error if there are any,
-	 * because we will eventually be allowed to wait for them again.
+	 * Traditionally we see ptrace'd stopped tasks regardless of options.
 	 */
-	if (!*notask_error)
-		return 0;
+	options |= WUNTRACED;
 
-	list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
-		int ret = eligible_child(type, pid, options, p);
-		if (unlikely(ret < 0))
+	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+		int ret = wait_consider_task(tsk, 1, p, notask_error,
+					     type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
 			return ret;
-		if (ret) {
-			*notask_error = 0;
-			return 0;
-		}
 	}
 
 	return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4bd2f516401..adefc1131f2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1125,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_children);
-	INIT_LIST_HEAD(&p->ptrace_list);
+	INIT_LIST_HEAD(&p->ptrace_entry);
+	INIT_LIST_HEAD(&p->ptraced);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
@@ -1198,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		add_parent(p);
+		list_add_tail(&p->sibling, &p->real_parent->children);
 		if (unlikely(p->ptrace & PT_PTRACED))
 			__ptrace_link(p, current->parent);
 
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e337390fce0..8392a9da645 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -33,13 +33,9 @@
  */
 void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
 {
-	BUG_ON(!list_empty(&child->ptrace_list));
-	if (child->parent == new_parent)
-		return;
-	list_add(&child->ptrace_list, &child->parent->ptrace_children);
-	remove_parent(child);
+	BUG_ON(!list_empty(&child->ptrace_entry));
+	list_add(&child->ptrace_entry, &new_parent->ptraced);
 	child->parent = new_parent;
-	add_parent(child);
 }
  
 /*
@@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child)
 	BUG_ON(!child->ptrace);
 
 	child->ptrace = 0;
-	if (ptrace_reparented(child)) {
-		list_del_init(&child->ptrace_list);
-		remove_parent(child);
-		child->parent = child->real_parent;
-		add_parent(child);
-	}
+	child->parent = child->real_parent;
+	list_del_init(&child->ptrace_entry);
 
 	if (task_is_traced(child))
 		ptrace_untrace(child);
@@ -492,15 +484,34 @@ int ptrace_traceme(void)
 	/*
 	 * Are we already being traced?
 	 */
+repeat:
 	task_lock(current);
 	if (!(current->ptrace & PT_PTRACED)) {
+		/*
+		 * See ptrace_attach() comments about the locking here.
+		 */
+		unsigned long flags;
+		if (!write_trylock_irqsave(&tasklist_lock, flags)) {
+			task_unlock(current);
+			do {
+				cpu_relax();
+			} while (!write_can_lock(&tasklist_lock));
+			goto repeat;
+		}
+
 		ret = security_ptrace(current->parent, current,
 				      PTRACE_MODE_ATTACH);
+
 		/*
 		 * Set the ptrace bit in the process ptrace flags.
+		 * Then link us on our parent's ptraced list.
 		 */
-		if (!ret)
+		if (!ret) {
 			current->ptrace |= PT_PTRACED;
+			__ptrace_link(current, current->real_parent);
+		}
+
+		write_unlock_irqrestore(&tasklist_lock, flags);
 	}
 	task_unlock(current);
 	return ret;
-- 
cgit v1.2.3-70-g09d2


From 14dd0b81414a58caf0296dbeace016bb0a5d11ab Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sun, 30 Mar 2008 18:41:25 -0700
Subject: do_wait: return security_task_wait() error code in place of -ECHILD

This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
"do_wait: fix security checks".  That change reverted the effect of commit
73243284463a761e04d69d22c7516b2be7de096c.  The rationale for the original
commit still stands.  The inconsistent treatment of children hidden by
ptrace was an unintended omission in the original change and in no way
invalidates its purpose.

This makes do_wait return the error returned by security_task_wait()
(usually -EACCES) in place of -ECHILD when there are some children the
caller would be able to wait for if not for the permission failure.  A
permission error will give the user a clue to look for security policy
problems, rather than for mysterious wait bugs.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c | 30 ++++++++++++++++++++----------
 1 file changed, 20 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 1e909826a80..a2af6cac823 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1199,14 +1199,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 		return 0;
 
 	err = security_task_wait(p);
-	if (likely(!err))
-		return 1;
+	if (err)
+		return err;
 
-	if (type != PIDTYPE_PID)
-		return 0;
-	/* This child was explicitly requested, abort */
-	read_unlock(&tasklist_lock);
-	return err;
+	return 1;
 }
 
 static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1536,7 +1532,8 @@ static int wait_task_continued(struct task_struct *p, int options,
  * -ECHILD should be in *@notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
+ * then *@notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
  */
 static int wait_consider_task(struct task_struct *parent, int ptrace,
 			      struct task_struct *p, int *notask_error,
@@ -1545,9 +1542,21 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 			      int __user *stat_addr, struct rusage __user *ru)
 {
 	int ret = eligible_child(type, pid, options, p);
-	if (ret <= 0)
+	if (!ret)
 		return ret;
 
+	if (unlikely(ret < 0)) {
+		/*
+		 * If we have not yet seen any eligible child,
+		 * then let this error code replace -ECHILD.
+		 * A permission error will give the user a clue
+		 * to look for security policy problems, rather
+		 * than for mysterious wait bugs.
+		 */
+		if (*notask_error)
+			*notask_error = ret;
+	}
+
 	if (likely(!ptrace) && unlikely(p->ptrace)) {
 		/*
 		 * This child is hidden by ptrace.
@@ -1585,7 +1594,8 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
  * -ECHILD should be in *@notask_error before the first call.
  * Returns nonzero for a final return, when we have unlocked tasklist_lock.
  * Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children, or still -ECHILD.
+ * *@notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
  */
 static int do_wait_thread(struct task_struct *tsk, int *notask_error,
 			  enum pid_type type, struct pid *pid, int options,
-- 
cgit v1.2.3-70-g09d2


From 666f164f4fbfa78bd00fb4b74788b42a39842c64 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Tue, 8 Apr 2008 23:12:30 -0700
Subject: fix dangling zombie when new parent ignores children

This fixes an arcane bug that we think was a regression introduced
by commit b2b2cbc4b2a2f389442549399a993a8306420baf.  When a parent
ignores SIGCHLD (or uses SA_NOCLDWAIT), its children would self-reap
but they don't if it's using ptrace on them.  When the parent thread
later exits and ceases to ptrace a child but leaves other live
threads in the parent's thread group, any zombie children are left
dangling.  The fix makes them self-reap then, as they would have
done earlier if ptrace had not been in use.

Signed-off-by: Roland McGrath <roland@redhat.com>
---
 kernel/exit.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index a2af6cac823..93d2711b938 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -702,6 +702,23 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
+/*
+ * Return nonzero if @parent's children should reap themselves.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static int ignoring_children(struct task_struct *parent)
+{
+	int ret;
+	struct sighand_struct *psig = parent->sighand;
+	unsigned long flags;
+	spin_lock_irqsave(&psig->siglock, flags);
+	ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+	       (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
+	spin_unlock_irqrestore(&psig->siglock, flags);
+	return ret;
+}
+
 /*
  * Detach all tasks we were using ptrace on.
  * Any that need to be release_task'd are put on the @dead list.
@@ -711,6 +728,7 @@ static void exit_mm(struct task_struct * tsk)
 static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
 	struct task_struct *p, *n;
+	int ign = -1;
 
 	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
 		__ptrace_unlink(p);
@@ -726,10 +744,18 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 		 * release_task() here because we already hold tasklist_lock.
 		 *
 		 * If it's our own child, there is no notification to do.
+		 * But if our normal children self-reap, then this child
+		 * was prevented by ptrace and we must reap it now.
 		 */
 		if (!task_detached(p) && thread_group_empty(p)) {
 			if (!same_thread_group(p->real_parent, parent))
 				do_notify_parent(p, p->exit_signal);
+			else {
+				if (ign < 0)
+					ign = ignoring_children(parent);
+				if (ign)
+					p->exit_signal = -1;
+			}
 		}
 
 		if (task_detached(p)) {
-- 
cgit v1.2.3-70-g09d2


From c349e0a01c3e0f70913db6a5bb61ab204e0602de Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 15 Apr 2008 22:39:31 +0200
Subject: ftrace: do not trace scheduler functions

do not trace scheduler functions - it's still a bit fragile
and can lock up with:

  http://redhat.com/~mingo/misc/config-Thu_Jul_17_13_34_52_CEST_2008

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 0a7ed838984..985ddb7da4d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -11,8 +11,6 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
-CFLAGS_REMOVE_sched.o = -mno-spe
-
 ifdef CONFIG_FTRACE
 # Do not trace debug files and internal ftrace files
 CFLAGS_REMOVE_lockdep.o = -pg
@@ -21,6 +19,7 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
-- 
cgit v1.2.3-70-g09d2